1 module General = struct
2 (******************************************************************************)
3 (* *)
4 (* Menhir *)
5 (* *)
6 (* François Pottier, Inria Paris *)
7 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
8 (* *)
9 (* Copyright Inria. All rights reserved. This file is distributed under the *)
10 (* terms of the GNU Library General Public License version 2, with a *)
11 (* special exception on linking, as described in the file LICENSE. *)
12 (* *)
13 (******************************************************************************)
14
15 (* --------------------------------------------------------------------------- *)
16
17 (* Lists. *)
18
19 let rec take n xs =
20 match n, xs with
21 | 0, _
22 | _, [] ->
23 []
24 | _, (x :: xs as input) ->
25 let xs' = take (n - 1) xs in
26 if xs == xs' then
27 input
28 else
29 x :: xs'
30
31 let rec drop n xs =
32 match n, xs with
33 | 0, _ ->
34 xs
35 | _, [] ->
36 []
37 | _, _ :: xs ->
38 drop (n - 1) xs
39
40 let rec uniq1 cmp x ys =
41 match ys with
42 | [] ->
43 []
44 | y :: ys ->
45 if cmp x y = 0 then
46 uniq1 cmp x ys
47 else
48 y :: uniq1 cmp y ys
49
50 let uniq cmp xs =
51 match xs with
52 | [] ->
53 []
54 | x :: xs ->
55 x :: uniq1 cmp x xs
56
57 let weed cmp xs =
58 uniq cmp (List.sort cmp xs)
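(* For illustration only (not part of the original file): a brief sketch of
   how these list helpers behave. [take] and [drop] split a list at an index,
   while [weed] sorts a list and removes duplicates according to a comparison
   function.

     let () =
       assert (take 2 [1; 2; 3; 4] = [1; 2]);
       assert (drop 2 [1; 2; 3; 4] = [3; 4]);
       assert (weed compare [3; 1; 3; 2; 1] = [1; 2; 3])
*)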
59
60 (* --------------------------------------------------------------------------- *)
61
62 (* Streams. *)
63
64 type 'a stream =
65 'a head Lazy.t
66
67 and 'a head =
68 | Nil
69 | Cons of 'a * 'a stream
70
71 (* The length of a stream. *)
72
73 let rec length xs =
74 match Lazy.force xs with
75 | Nil ->
76 0
77 | Cons (_, xs) ->
78 1 + length xs
79
80 (* Folding over a stream. *)
81
82 let rec foldr f xs accu =
83 match Lazy.force xs with
84 | Nil ->
85 accu
86 | Cons (x, xs) ->
87 f x (foldr f xs accu)
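(* For illustration only: a finite stream can be built by wrapping each cell
   in [lazy], then consumed with [length] or [foldr]. A small sketch:

     let s123 : int stream =
       lazy (Cons (1, lazy (Cons (2, lazy (Cons (3, lazy Nil))))))

     let () =
       assert (length s123 = 3);
       assert (foldr (+) s123 0 = 6)
*)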
88
89 end
90 module Convert = struct
91 (******************************************************************************)
92 (* *)
93 (* Menhir *)
94 (* *)
95 (* François Pottier, Inria Paris *)
96 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
97 (* *)
98 (* Copyright Inria. All rights reserved. This file is distributed under the *)
99 (* terms of the GNU Library General Public License version 2, with a *)
100 (* special exception on linking, as described in the file LICENSE. *)
101 (* *)
102 (******************************************************************************)
103
104 (* An ocamlyacc-style, or Menhir-style, parser requires access to
105 the lexer, which must be parameterized with a lexing buffer, and
106 to the lexing buffer itself, where it reads position information. *)
107
108 (* This traditional API is convenient when used with ocamllex, but
109 inelegant when used with other lexer generators. *)
110
111 type ('token, 'semantic_value) traditional =
112 (Lexing.lexbuf -> 'token) -> Lexing.lexbuf -> 'semantic_value
113
114 (* This revised API is independent of any lexer generator. Here, the
115 parser only requires access to the lexer, and the lexer takes no
116 parameters. The tokens returned by the lexer may contain position
117 information. *)
118
119 type ('token, 'semantic_value) revised =
120 (unit -> 'token) -> 'semantic_value
121
122 (* --------------------------------------------------------------------------- *)
123
124 (* Converting a traditional parser, produced by ocamlyacc or Menhir,
125 into a revised parser. *)
126
127 (* A token of the revised lexer is essentially a triple of a token
128 of the traditional lexer (or raw token), a start position, and
129 an end position. The three [get] functions are accessors. *)
130
131 (* We do not require the type ['token] to actually be a triple type.
132 This enables complex applications where it is a record type with
133 more than three fields. It also enables simple applications where
134 positions are of no interest, so ['token] is just ['raw_token]
135 and [get_startp] and [get_endp] return dummy positions. *)
136
137 let traditional2revised
138 (get_raw_token : 'token -> 'raw_token)
139 (get_startp : 'token -> Lexing.position)
140 (get_endp : 'token -> Lexing.position)
141 (parser : ('raw_token, 'semantic_value) traditional)
142 : ('token, 'semantic_value) revised =
143
144 (* Accept a revised lexer. *)
145
146 fun (lexer : unit -> 'token) ->
147
148 (* Create a dummy lexing buffer. *)
149
150 let lexbuf : Lexing.lexbuf =
151 Lexing.from_string ""
152 in
153
154 (* Wrap the revised lexer as a traditional lexer. A traditional
155 lexer returns a raw token and updates the fields of the lexing
156 buffer with new positions, which will be read by the parser. *)
157
158 let lexer (lexbuf : Lexing.lexbuf) : 'raw_token =
159 let token : 'token = lexer() in
160 lexbuf.Lexing.lex_start_p <- get_startp token;
161 lexbuf.Lexing.lex_curr_p <- get_endp token;
162 get_raw_token token
163 in
164
165 (* Invoke the traditional parser. *)
166
167 parser lexer lexbuf
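(* For illustration only (the names [Raw], [tok], [startp], and [endp] below
   are hypothetical): if the revised lexer produces records of the form
   [{ tok : Raw.token; startp : Lexing.position; endp : Lexing.position }],
   then a traditional parser [Raw.main] can be wrapped as follows:

     let main_revised =
       traditional2revised
         (fun t -> t.tok)
         (fun t -> t.startp)
         (fun t -> t.endp)
         Raw.main
*)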
168
169 (* --------------------------------------------------------------------------- *)
170
171 (* Converting a revised parser back to a traditional parser. *)
172
173 let revised2traditional
174 (make_token : 'raw_token -> Lexing.position -> Lexing.position -> 'token)
175 (parser : ('token, 'semantic_value) revised)
176 : ('raw_token, 'semantic_value) traditional =
177
178 (* Accept a traditional lexer and a lexing buffer. *)
179
180 fun (lexer : Lexing.lexbuf -> 'raw_token) (lexbuf : Lexing.lexbuf) ->
181
182 (* Wrap the traditional lexer as a revised lexer. *)
183
184 let lexer () : 'token =
185 let token : 'raw_token = lexer lexbuf in
186 make_token token lexbuf.Lexing.lex_start_p lexbuf.Lexing.lex_curr_p
187 in
188
189 (* Invoke the revised parser. *)
190
191 parser lexer
192
193 (* --------------------------------------------------------------------------- *)
194
195 (* Simplified versions of the above, where concrete triples are used. *)
196
197 module Simplified = struct
198
199 let traditional2revised parser =
200 traditional2revised
201 (fun (token, _, _) -> token)
202 (fun (_, startp, _) -> startp)
203 (fun (_, _, endp) -> endp)
204 parser
205
206 let revised2traditional parser =
207 revised2traditional
208 (fun token startp endp -> (token, startp, endp))
209 parser
210
211 end
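(* For illustration only (the module name [Parser] is hypothetical): if
   [Parser.main] is a traditional ocamlyacc- or Menhir-generated entry point
   and [next] is a lexer that returns concrete triples, they can be combined
   as follows:

     let parse (next : unit -> Parser.token * Lexing.position * Lexing.position) =
       Simplified.traditional2revised Parser.main next
*)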
212 end
213 module IncrementalEngine = struct
214 (******************************************************************************)
215 (* *)
216 (* Menhir *)
217 (* *)
218 (* François Pottier, Inria Paris *)
219 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
220 (* *)
221 (* Copyright Inria. All rights reserved. This file is distributed under the *)
222 (* terms of the GNU Library General Public License version 2, with a *)
223 (* special exception on linking, as described in the file LICENSE. *)
224 (* *)
225 (******************************************************************************)
226
227 type position = Lexing.position
228
229 open General
230
231 (* This signature describes the incremental LR engine. *)
232
233 (* In this mode, the user controls the lexer, and the parser suspends
234 itself when it needs to read a new token. *)
235
236 module type INCREMENTAL_ENGINE = sig
237
238 type token
239
240 (* A value of type [production] is (an index for) a production. The start
241 productions (which do not exist in a .mly file, but are constructed by
242 Menhir internally) are not part of this type. *)
243
244 type production
245
246 (* The type ['a checkpoint] represents an intermediate or final state of the
247 parser. An intermediate checkpoint is a suspension: it records the parser's
248 current state, and allows parsing to be resumed. The parameter ['a] is
249 the type of the semantic value that will eventually be produced if the
250 parser succeeds. *)
251
252 (* [Accepted] and [Rejected] are final checkpoints. [Accepted] carries a
253 semantic value. *)
254
255 (* [InputNeeded] is an intermediate checkpoint. It means that the parser wishes
256 to read one token before continuing. *)
257
258 (* [Shifting] is an intermediate checkpoint. It means that the parser is taking
259 a shift transition. It exposes the state of the parser before and after
260 the transition. The Boolean parameter tells whether the parser intends to
261 request a new token after this transition. (It always does, except when
262 it is about to accept.) *)
263
264 (* [AboutToReduce] is an intermediate checkpoint. It means that the parser is
265 about to perform a reduction step. It exposes the parser's current
266 state as well as the production that is about to be reduced. *)
267
268 (* [HandlingError] is an intermediate checkpoint. It means that the parser has
269 detected an error and is currently handling it, in several steps. *)
270
271 (* A value of type ['a env] represents a configuration of the automaton:
272 current state, stack, lookahead token, etc. The parameter ['a] is the
273 type of the semantic value that will eventually be produced if the parser
274 succeeds. *)
275
276 (* In normal operation, the parser works with checkpoints: see the functions
277 [offer] and [resume]. However, it is also possible to work directly with
278 environments (see the functions [pop], [force_reduction], and [feed]) and
279 to reconstruct a checkpoint out of an environment (see [input_needed]).
280 This is considered advanced functionality; its purpose is to allow error
281 recovery strategies to be programmed by the user. *)
282
283 type 'a env
284
285 type 'a checkpoint = private
286 | InputNeeded of 'a env
287 | Shifting of 'a env * 'a env * bool
288 | AboutToReduce of 'a env * production
289 | HandlingError of 'a env
290 | Accepted of 'a
291 | Rejected
292
293 (* [offer] allows the user to resume the parser after it has suspended
294 itself with a checkpoint of the form [InputNeeded env]. [offer] expects the
295 old checkpoint as well as a new token and produces a new checkpoint. It does not
296 raise any exception. *)
297
298 val offer:
299 'a checkpoint ->
300 token * position * position ->
301 'a checkpoint
302
303 (* [resume] allows the user to resume the parser after it has suspended
304 itself with a checkpoint of the form [AboutToReduce (env, prod)] or
305 [HandlingError env]. [resume] expects the old checkpoint and produces a new
306 checkpoint. It does not raise any exception. *)
307
308 val resume:
309 'a checkpoint ->
310 'a checkpoint
311
312 (* A token supplier is a function of no arguments which delivers a new token
313 (together with its start and end positions) every time it is called. *)
314
315 type supplier =
316 unit -> token * position * position
317
318 (* A pair of a lexer and a lexing buffer can be easily turned into a supplier. *)
319
320 val lexer_lexbuf_to_supplier:
321 (Lexing.lexbuf -> token) ->
322 Lexing.lexbuf ->
323 supplier
324
325 (* The functions [offer] and [resume] are sufficient to write a parser loop.
326 One can imagine many variations (which is why we expose these functions
327 in the first place!). Here, we expose a few variations of the main loop,
328 ready for use. *)
329
330 (* [loop supplier checkpoint] begins parsing from [checkpoint], reading
331 tokens from [supplier]. It continues parsing until it reaches a
332 checkpoint of the form [Accepted v] or [Rejected]. In the former case, it
333 returns [v]. In the latter case, it raises the exception [Error]. *)
334
335 val loop: supplier -> 'a checkpoint -> 'a
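(* For illustration only (the names [Lexer.token] and [Parser.Incremental.main]
   below are hypothetical, standing for an ocamllex-generated lexer and a
   Menhir-generated incremental entry point): a complete parse using [loop]
   might be written as follows:

     let parse lexbuf =
       let supplier = lexer_lexbuf_to_supplier Lexer.token lexbuf in
       loop supplier (Parser.Incremental.main lexbuf.Lexing.lex_curr_p)
*)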
336
337 (* [loop_handle succeed fail supplier checkpoint] begins parsing from
338 [checkpoint], reading tokens from [supplier]. It continues parsing until
339 it reaches a checkpoint of the form [Accepted v] or [HandlingError env]
340 (or [Rejected], but that should not happen, as [HandlingError _] will be
341 observed first). In the former case, it calls [succeed v]. In the latter
342 case, it calls [fail] with this checkpoint. It cannot raise [Error].
343
344 This means that Menhir's traditional error-handling procedure (which pops
345 the stack until a state that can act on the [error] token is found) does
346 not get a chance to run. Instead, the user can implement her own error
347 handling code, in the [fail] continuation. *)
348
349 val loop_handle:
350 ('a -> 'answer) ->
351 ('a checkpoint -> 'answer) ->
352 supplier -> 'a checkpoint -> 'answer
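(* For illustration only (same hypothetical names as in the previous sketch):
   [loop_handle] makes it easy to report failure as a value instead of
   raising [Error]:

     let parse lexbuf =
       let supplier = lexer_lexbuf_to_supplier Lexer.token lexbuf in
       loop_handle
         (fun v -> Some v)
         (fun _checkpoint -> None)
         supplier
         (Parser.Incremental.main lexbuf.Lexing.lex_curr_p)
*)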
353
354 (* [loop_handle_undo] is analogous to [loop_handle], except it passes a pair
355 of checkpoints to the failure continuation.
356
357 The first (and oldest) checkpoint is the last [InputNeeded] checkpoint that
358 was encountered before the error was detected. The second (and newest)
359 checkpoint is where the error was detected, as in [loop_handle]. Going back
360 to the first checkpoint can be thought of as undoing any reductions that
361 were performed after seeing the problematic token. (These reductions must
362 be default reductions or spurious reductions.)
363
364 [loop_handle_undo] must initially be applied to an [InputNeeded] checkpoint.
365 The parser's initial checkpoints satisfy this constraint. *)
366
367 val loop_handle_undo:
368 ('a -> 'answer) ->
369 ('a checkpoint -> 'a checkpoint -> 'answer) ->
370 supplier -> 'a checkpoint -> 'answer
371
372 (* [shifts checkpoint] assumes that [checkpoint] has been obtained by
373 submitting a token to the parser. It runs the parser from [checkpoint],
374 through an arbitrary number of reductions, until the parser either
375 accepts this token (i.e., shifts) or rejects it (i.e., signals an error).
376 If the parser decides to shift, then [Some env] is returned, where [env]
377 is the parser's state just before shifting. Otherwise, [None] is
378 returned. *)
379
380 (* It is desirable that the semantic actions be side-effect free, or that
381 their side-effects be harmless (replayable). *)
382
383 val shifts: 'a checkpoint -> 'a env option
384
385 (* The function [acceptable] allows testing, after an error has been
386 detected, which tokens would have been accepted at this point. It is
387 implemented using [shifts]. Its argument should be an [InputNeeded]
388 checkpoint. *)
389
390 (* For this test to be meaningful, one must undo any spurious reductions
391 before carrying it out -- that is, one must apply [acceptable] to the FIRST
392 checkpoint that is passed by [loop_handle_undo] to its failure continuation. *)
393
394 (* This test causes some semantic actions to be run! The semantic actions
395 should be side-effect free, or their side-effects should be harmless. *)
396
397 (* The position [pos] is used as the start and end positions of the
398 hypothetical token, and may be picked up by the semantic actions. We
399 suggest using the position where the error was detected. *)
400
401 val acceptable: 'a checkpoint -> token -> position -> bool
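(* For illustration only: given a list of candidate tokens (how they are
   enumerated is up to the application), [acceptable] can be used to suggest
   possible continuations after an error. Here, [checkpoint] is assumed to be
   the FIRST checkpoint passed by [loop_handle_undo] to its failure
   continuation, and [pos] the position where the error was detected:

     let suggestions (candidates : token list) checkpoint pos : token list =
       List.filter (fun tok -> acceptable checkpoint tok pos) candidates
*)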
402
403 (* The abstract type ['a lr1state] describes the non-initial states of the
404 LR(1) automaton. The index ['a] represents the type of the semantic value
405 associated with this state's incoming symbol. *)
406
407 type 'a lr1state
408
409 (* The states of the LR(1) automaton are numbered (from 0 and up). *)
410
411 val number: _ lr1state -> int
412
413 (* Productions are numbered. *)
414
415 (* [find_production i] requires the index [i] to be valid. Use with care. *)
416
417 val production_index: production -> int
418 val find_production: int -> production
419
420 (* An element is a pair of a non-initial state [s] and a semantic value [v]
421 associated with the incoming symbol of this state. The idea is, the value
422 [v] was pushed onto the stack just before the state [s] was entered. Thus,
423 for some type ['a], the state [s] has type ['a lr1state] and the value [v]
424 has type ['a]. In other words, the type [element] is an existential type. *)
425
426 type element =
427 | Element: 'a lr1state * 'a * position * position -> element
428
429 (* The parser's stack is (or, more precisely, can be viewed as) a stream of
430 elements. The type [stream] is defined by the module [General]. *)
431
432 (* As of 2017/03/31, the types [stream] and [stack] and the function [stack]
433 are DEPRECATED. They might be removed in the future. An alternative way
434 of inspecting the stack is via the functions [top] and [pop]. *)
435
436 type stack = (* DEPRECATED *)
437 element stream
438
439 (* This is the parser's stack, a stream of elements. This stream is empty if
440 the parser is in an initial state; otherwise, it is non-empty. The LR(1)
441 automaton's current state is the one found in the top element of the
442 stack. *)
443
444 val stack: 'a env -> stack (* DEPRECATED *)
445
446 (* [top env] returns the parser's top stack element. The state contained in
447 this stack element is the current state of the automaton. If the stack is
448 empty, [None] is returned. In that case, the current state of the
449 automaton must be an initial state. *)
450
451 val top: 'a env -> element option
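(* For illustration only: [top] can be used, for instance, to retrieve the
   positions of the most recently recognized symbol when building an error
   message:

     let last_symbol_positions env : (position * position) option =
       match top env with
       | Some (Element (_, _, startp, endp)) -> Some (startp, endp)
       | None -> None
*)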
452
453 (* [pop_many i env] pops [i] cells off the automaton's stack. This is done
454 via [i] successive invocations of [pop]. Thus, [pop_many 1] is [pop]. The
455 index [i] must be nonnegative. The time complexity is O(i). *)
456
457 val pop_many: int -> 'a env -> 'a env option
458
459 (* [get i env] returns the parser's [i]-th stack element. The index [i] is
460 0-based: thus, [get 0] is [top]. If [i] is greater than or equal to the
461 number of elements in the stack, [None] is returned. The time complexity
462 is O(i). *)
463
464 val get: int -> 'a env -> element option
465
466 (* [current_state_number env] is (the integer number of) the automaton's
467 current state. This works even if the automaton's stack is empty, in
468 which case the current state is an initial state. This number can be
469 passed as an argument to a [message] function generated by [menhir
470 --compile-errors]. *)
471
472 val current_state_number: 'a env -> int
473
474 (* [equal env1 env2] tells whether the parser configurations [env1] and
475 [env2] are equal in the sense that the automaton's current state is the
476 same in [env1] and [env2] and the stack is *physically* the same in
477 [env1] and [env2]. If [equal env1 env2] is [true], then the sequence of
478 the stack elements, as observed via [pop] and [top], must be the same in
479 [env1] and [env2]. Also, if [equal env1 env2] holds, then the checkpoints
480 [input_needed env1] and [input_needed env2] must be equivalent. The
481 function [equal] has time complexity O(1). *)
482
483 val equal: 'a env -> 'a env -> bool
484
485 (* These are the start and end positions of the current lookahead token. If
486 invoked in an initial state, this function returns the initial position,
487 twice. *)
488
489 val positions: 'a env -> position * position
490
491 (* When applied to an environment taken from a checkpoint of the form
492 [AboutToReduce (env, prod)], the function [env_has_default_reduction]
493 tells whether the reduction that is about to take place is a default
494 reduction. *)
495
496 val env_has_default_reduction: 'a env -> bool
497
498 (* [state_has_default_reduction s] tells whether the state [s] has a default
499 reduction. This includes the case where [s] is an accepting state. *)
500
501 val state_has_default_reduction: _ lr1state -> bool
502
503 (* [pop env] returns a new environment, where the parser's top stack cell
504 has been popped off. (If the stack is empty, [None] is returned.) This
505 amounts to pretending that the (terminal or nonterminal) symbol that
506 corresponds to this stack cell has not been read. *)
507
508 val pop: 'a env -> 'a env option
509
510 (* [force_reduction prod env] should be called only if in the state [env]
511 the parser is capable of reducing the production [prod]. If this
512 condition is satisfied, then this production is reduced, which means that
513 its semantic action is executed (this can have side effects!) and the
514 automaton makes a goto (nonterminal) transition. If this condition is not
515 satisfied, [Invalid_argument _] is raised. *)
516
517 val force_reduction: production -> 'a env -> 'a env
518
519 (* [input_needed env] returns [InputNeeded env]. That is, out of an [env]
520 that might have been obtained via a series of calls to the functions
521 [pop], [force_reduction], [feed], etc., it produces a checkpoint, which
522 can be used to resume normal parsing, by supplying this checkpoint as an
523 argument to [offer]. *)
524
525 (* This function should be used with some care. It could "mess up the
526 lookahead" in the sense that it allows parsing to resume in an arbitrary
527 state [s] with an arbitrary lookahead symbol [t], even though Menhir's
528 reachability analysis (menhir --list-errors) might well think that it is
529 impossible to reach this particular configuration. If one is using
530 Menhir's new error reporting facility, this could cause the parser to
531 reach an error state for which no error message has been prepared. *)
532
533 val input_needed: 'a env -> 'a checkpoint
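(* For illustration only, a minimal recovery sketch: starting from an [env]
   extracted from a failure checkpoint, pop one cell (if possible) and
   rebuild a checkpoint from which parsing can be resumed via [offer]:

     let recover (env : 'a env) : 'a checkpoint option =
       match pop env with
       | Some env -> Some (input_needed env)
       | None -> None
*)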
534
535 end
536
537 (* This signature is a fragment of the inspection API that is made available
538 to the user when [--inspection] is used. This fragment contains type
539 definitions for symbols. *)
540
541 module type SYMBOLS = sig
542
543 (* The type ['a terminal] represents a terminal symbol. The type ['a
544 nonterminal] represents a nonterminal symbol. In both cases, the index
545 ['a] represents the type of the semantic values associated with this
546 symbol. The concrete definitions of these types are generated. *)
547
548 type 'a terminal
549 type 'a nonterminal
550
551 (* The type ['a symbol] represents a terminal or nonterminal symbol. It is
552 the disjoint union of the types ['a terminal] and ['a nonterminal]. *)
553
554 type 'a symbol =
555 | T : 'a terminal -> 'a symbol
556 | N : 'a nonterminal -> 'a symbol
557
558 (* The type [xsymbol] is an existentially quantified version of the type
559 ['a symbol]. This type is useful in situations where the index ['a]
560 is not statically known. *)
561
562 type xsymbol =
563 | X : 'a symbol -> xsymbol
564
565 end
566
567 (* This signature describes the inspection API that is made available to the
568 user when [--inspection] is used. *)
569
570 module type INSPECTION = sig
571
572 (* The types of symbols are described above. *)
573
574 include SYMBOLS
575
576 (* The type ['a lr1state] is meant to be the same as in [INCREMENTAL_ENGINE]. *)
577
578 type 'a lr1state
579
580 (* The type [production] is meant to be the same as in [INCREMENTAL_ENGINE].
581 It represents a production of the grammar. A production can be examined
582 via the functions [lhs] and [rhs] below. *)
583
584 type production
585
586 (* An LR(0) item is a pair of a production [prod] and a valid index [i] into
587 this production. That is, if the length of [rhs prod] is [n], then [i] is
588 between 0 and [n], inclusive. *)
589
590 type item =
591 production * int
592
593 (* Ordering functions. *)
594
595 val compare_terminals: _ terminal -> _ terminal -> int
596 val compare_nonterminals: _ nonterminal -> _ nonterminal -> int
597 val compare_symbols: xsymbol -> xsymbol -> int
598 val compare_productions: production -> production -> int
599 val compare_items: item -> item -> int
600
601 (* [incoming_symbol s] is the incoming symbol of the state [s], that is,
602 the symbol that the parser must recognize before (has recognized when)
603 it enters the state [s]. This function gives access to the semantic
604 value [v] stored in a stack element [Element (s, v, _, _)]. Indeed,
605 by case analysis on the symbol [incoming_symbol s], one discovers the
606 type ['a] of the value [v]. *)
607
608 val incoming_symbol: 'a lr1state -> 'a symbol
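(* For illustration only (the constructors [T_INT] and [N_expr] are
   hypothetical, generated names, and the type [element] of the incremental
   API is assumed to be in scope, as in [EVERYTHING]): case analysis on the
   incoming symbol recovers a well-typed view of the semantic value stored
   in a stack element:

     let describe (Element (s, v, _, _)) : string =
       match incoming_symbol s with
       | T T_INT -> string_of_int v   (* here, [v] has type [int] *)
       | N N_expr -> "an expression"
       | _ -> "some other symbol"
*)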
609
610 (* [items s] is the set of the LR(0) items in the LR(0) core of the LR(1)
611 state [s]. This set is not epsilon-closed. This set is presented as a
612 list, in an arbitrary order. *)
613
614 val items: _ lr1state -> item list
615
616 (* [lhs prod] is the left-hand side of the production [prod]. This is
617 always a non-terminal symbol. *)
618
619 val lhs: production -> xsymbol
620
621 (* [rhs prod] is the right-hand side of the production [prod]. This is
622 a (possibly empty) sequence of (terminal or nonterminal) symbols. *)
623
624 val rhs: production -> xsymbol list
625
626 (* [nullable nt] tells whether the non-terminal symbol [nt] is nullable.
627 That is, it is true if and only if this symbol produces the empty
628 word [epsilon]. *)
629
630 val nullable: _ nonterminal -> bool
631
632 (* [first nt t] tells whether the FIRST set of the nonterminal symbol [nt]
633 contains the terminal symbol [t]. That is, it is true if and only if
634 [nt] produces a word that begins with [t]. *)
635
636 val first: _ nonterminal -> _ terminal -> bool
637
638 (* [xfirst] is analogous to [first], but expects a first argument of type
639 [xsymbol] instead of [_ nonterminal]. *)
640
641 val xfirst: xsymbol -> _ terminal -> bool
642
643 (* [foreach_terminal] enumerates the terminal symbols, including [error].
644 [foreach_terminal_but_error] enumerates the terminal symbols, excluding
645 [error]. *)
646
647 val foreach_terminal: (xsymbol -> 'a -> 'a) -> 'a -> 'a
648 val foreach_terminal_but_error: (xsymbol -> 'a -> 'a) -> 'a -> 'a
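(* For illustration only: [foreach_terminal] is a fold; for instance, the
   list of all terminal symbols, as [xsymbol] values, can be obtained with:

     let all_terminals : xsymbol list =
       foreach_terminal (fun x accu -> x :: accu) []
*)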
649
650 (* The type [env] is meant to be the same as in [INCREMENTAL_ENGINE]. *)
651
652 type 'a env
653
654 (* [feed symbol startp semv endp env] causes the parser to consume the
655 (terminal or nonterminal) symbol [symbol], accompanied with the semantic
656 value [semv] and with the start and end positions [startp] and [endp].
657 Thus, the automaton makes a transition, and reaches a new state. The
658 stack grows by one cell. This operation is permitted only if the current
659 state (as determined by [env]) has an outgoing transition labeled with
660 [symbol]. Otherwise, [Invalid_argument _] is raised. *)
661
662 val feed: 'a symbol -> position -> 'a -> position -> 'b env -> 'b env
663
664 end
665
666 (* This signature combines the incremental API and the inspection API. *)
667
668 module type EVERYTHING = sig
669
670 include INCREMENTAL_ENGINE
671
672 include INSPECTION
673 with type 'a lr1state := 'a lr1state
674 with type production := production
675 with type 'a env := 'a env
676
677 end
678 end
679 module EngineTypes = struct
680 (******************************************************************************)
681 (* *)
682 (* Menhir *)
683 (* *)
684 (* François Pottier, Inria Paris *)
685 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
686 (* *)
687 (* Copyright Inria. All rights reserved. This file is distributed under the *)
688 (* terms of the GNU Library General Public License version 2, with a *)
689 (* special exception on linking, as described in the file LICENSE. *)
690 (* *)
691 (******************************************************************************)
692
693 (* This file defines several types and module types that are used in the
694 specification of module [Engine]. *)
695
696 (* --------------------------------------------------------------------------- *)
697
698 (* It would be nice if we could keep the structure of stacks and environments
699 hidden. However, stacks and environments must be accessible to semantic
700 actions, so the following data structure definitions must be public. *)
701
702 (* --------------------------------------------------------------------------- *)
703
704 (* A stack is a linked list of cells. A sentinel cell -- which is its own
705 successor -- is used to mark the bottom of the stack. The sentinel cell
706 itself is not significant -- it contains dummy values. *)
707
708 type ('state, 'semantic_value) stack = {
709
710 (* The state that we should go back to if we pop this stack cell. *)
711
712 (* This convention means that the state contained in the top stack cell is
713 not the current state [env.current]. It also means that the state found
714 within the sentinel is a dummy -- it is never consulted. This convention
715 is the same as that adopted by the code-based back-end. *)
716
717 state: 'state;
718
719 (* The semantic value associated with the chunk of input that this cell
720 represents. *)
721
722 semv: 'semantic_value;
723
724 (* The start and end positions of the chunk of input that this cell
725 represents. *)
726
727 startp: Lexing.position;
728 endp: Lexing.position;
729
730 (* The next cell down in the stack. If this is a self-pointer, then this
731 cell is the sentinel, and the stack is conceptually empty. *)
732
733 next: ('state, 'semantic_value) stack;
734
735 }
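(* For illustration only: because the sentinel cell is its own successor, a
   physical-equality test suffices to detect the bottom of the stack:

     let is_bottom (cell : ('state, 'semantic_value) stack) : bool =
       cell.next == cell
*)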
736
737 (* --------------------------------------------------------------------------- *)
738
739 (* A parsing environment contains all of the parser's state (except for the
740 current program point). *)
741
742 type ('state, 'semantic_value, 'token) env = {
743
744 (* If this flag is true, then the first component of [env.triple] should
745 be ignored, as it has been logically overwritten with the [error]
746 pseudo-token. *)
747
748 error: bool;
749
750 (* The last token that was obtained from the lexer, together with its start
751 and end positions. Warning: before the first call to the lexer has taken
752 place, a dummy (and possibly invalid) token is stored here. *)
753
754 triple: 'token * Lexing.position * Lexing.position;
755
756 (* The stack. In [CodeBackend], it is passed around on its own,
757 whereas, here, it is accessed via the environment. *)
758
759 stack: ('state, 'semantic_value) stack;
760
761 (* The current state. In [CodeBackend], it is passed around on its
762 own, whereas, here, it is accessed via the environment. *)
763
764 current: 'state;
765
766 }
767
768 (* --------------------------------------------------------------------------- *)
769
770 (* This signature describes the parameters that must be supplied to the LR
771 engine. *)
772
773 module type TABLE = sig
774
775 (* The type of automaton states. *)
776
777 type state
778
779 (* States are numbered. *)
780
781 val number: state -> int
782
783 (* The type of tokens. These can be thought of as real tokens, that is,
784 tokens returned by the lexer. They carry a semantic value. This type
785 does not include the [error] pseudo-token. *)
786
787 type token
788
789 (* The type of terminal symbols. These can be thought of as integer codes.
790 They do not carry a semantic value. This type does include the [error]
791 pseudo-token. *)
792
793 type terminal
794
795 (* The type of nonterminal symbols. *)
796
797 type nonterminal
798
799 (* The type of semantic values. *)
800
801 type semantic_value
802
803 (* A token is conceptually a pair of a (non-[error]) terminal symbol and
804 a semantic value. The following two functions are the pair projections. *)
805
806 val token2terminal: token -> terminal
807 val token2value: token -> semantic_value
808
809 (* Even though the [error] pseudo-token is not a real token, it is a
810 terminal symbol. Furthermore, for regularity, it must have a semantic
811 value. *)
812
813 val error_terminal: terminal
814 val error_value: semantic_value
815
816 (* [foreach_terminal] allows iterating over all terminal symbols. *)
817
818 val foreach_terminal: (terminal -> 'a -> 'a) -> 'a -> 'a
819
820 (* The type of productions. *)
821
822 type production
823
824 val production_index: production -> int
825 val find_production: int -> production
826
827 (* If a state [s] has a default reduction on production [prod], then, upon
828 entering [s], the automaton should reduce [prod] without consulting the
829 lookahead token. The following function allows determining which states
830 have default reductions. *)
831
832 (* Instead of returning a value of a sum type -- either [DefRed prod], or
833 [NoDefRed] -- it accepts two continuations, and invokes just one of
834 them. This mechanism allows avoiding a memory allocation. *)
835
836 val default_reduction:
837 state ->
838 ('env -> production -> 'answer) ->
839 ('env -> 'answer) ->
840 'env -> 'answer
841
842 (* An LR automaton can normally take three kinds of actions: shift, reduce,
843 or fail. (Acceptance is a particular case of reduction: it consists in
844 reducing a start production.) *)
845
846 (* There are two variants of the shift action. [shift/discard s] instructs
847 the automaton to discard the current token, request a new one from the
848 lexer, and move to state [s]. [shift/nodiscard s] instructs it to move to
849 state [s] without requesting a new token. This instruction should be used
850 when [s] has a default reduction on [#]. See [CodeBackend.gettoken] for
851 details. *)
852
853 (* This is the automaton's action table. It maps a pair of a state and a
854 terminal symbol to an action. *)
855
856 (* Instead of returning a value of a sum type -- one of shift/discard,
857 shift/nodiscard, reduce, or fail -- this function accepts three
858 continuations, and invokes just one of them. This mechanism allows avoiding
859 a memory allocation. *)
860
861 (* In summary, the parameters to [action] are as follows:
862
863 - the first two parameters, a state and a terminal symbol, are used to
864 look up the action table;
865
866 - the next parameter is the semantic value associated with the above
867 terminal symbol; it is not used, only passed along to the shift
868 continuation, as explained below;
869
870 - the shift continuation expects an environment; a flag that tells
871 whether to discard the current token; the terminal symbol that
872 is being shifted; its semantic value; and the target state of
873 the transition;
874
875 - the reduce continuation expects an environment and a production;
876
877 - the fail continuation expects an environment;
878
879 - the last parameter is the environment; it is not used, only passed
880 along to the selected continuation. *)
881
882 val action:
883 state ->
884 terminal ->
885 semantic_value ->
886 ('env -> bool -> terminal -> semantic_value -> state -> 'answer) ->
887 ('env -> production -> 'answer) ->
888 ('env -> 'answer) ->
889 'env -> 'answer
890
891 (* This is the automaton's goto table. This table maps a pair of a state
892 and a nonterminal symbol to a new state. By extension, it also maps a
893 pair of a state and a production to a new state. *)
894
895 (* The function [goto_nt] can be applied to [s] and [nt] ONLY if the state
896 [s] has an outgoing transition labeled [nt]. Otherwise, its result is
897 undefined. Similarly, the call [goto_prod prod s] is permitted ONLY if
898 the state [s] has an outgoing transition labeled with the nonterminal
899 symbol [lhs prod]. The function [maybe_goto_nt] involves an additional
900 dynamic check and CAN be called even if there is no outgoing transition. *)
901
902 val goto_nt : state -> nonterminal -> state
903 val goto_prod: state -> production -> state
904 val maybe_goto_nt: state -> nonterminal -> state option
905
906 (* [is_start prod] tells whether the production [prod] is a start production. *)
907
908 val is_start: production -> bool
909
910 (* By convention, a semantic action is responsible for:
911
912 1. fetching whatever semantic values and positions it needs off the stack;
913
914 2. popping an appropriate number of cells off the stack, as dictated
915 by the length of the right-hand side of the production;
916
917 3. computing a new semantic value, as well as new start and end positions;
918
919 4. pushing a new stack cell, which contains the three values
920 computed in step 3;
921
922 5. returning the new stack computed in steps 2 and 4.
923
924 Point 1 is essentially forced upon us: if semantic values were fetched
925 off the stack by this interpreter, then the calling convention for
926 semantic actions would be variadic: not all semantic actions would have
927 the same number of arguments. The rest follows rather naturally. *)
928
929 (* Semantic actions are allowed to raise [Error]. *)
930
931 exception Error
932
933 type semantic_action =
934 (state, semantic_value, token) env -> (state, semantic_value) stack
935
936 val semantic_action: production -> semantic_action
937
938 (* [may_reduce state prod] tests whether the state [state] is capable of
939 reducing the production [prod]. This function is currently costly and
940 is not used by the core LR engine. It is used in the implementation
941 of certain functions, such as [force_reduction], which allow the engine
942 to be driven programmatically. *)
943
944 val may_reduce: state -> production -> bool
945
946 (* The LR engine requires a number of hooks, which are used for logging. *)
947
948 (* The comments below indicate the conventional messages that correspond
949 to these hooks in the code-based back-end; see [CodeBackend]. *)
950
951 (* If the flag [log] is false, then the logging functions are not called.
952 If it is [true], then they are called. *)
953
954 val log : bool
955
956 module Log : sig
957
958 (* State %d: *)
959
960 val state: state -> unit
961
962 (* Shifting (<terminal>) to state <state> *)
963
964 val shift: terminal -> state -> unit
965
966 (* Reducing a production should be logged either as a reduction
967 event (for regular productions) or as an acceptance event (for
968 start productions). *)
969
970 (* Reducing production <production> / Accepting *)
971
972 val reduce_or_accept: production -> unit
973
974 (* Lookahead token is now <terminal> (<pos>-<pos>) *)
975
976 val lookahead_token: terminal -> Lexing.position -> Lexing.position -> unit
977
978 (* Initiating error handling *)
979
980 val initiating_error_handling: unit -> unit
981
982 (* Resuming error handling *)
983
984 val resuming_error_handling: unit -> unit
985
986 (* Handling error in state <state> *)
987
988 val handling_error: state -> unit
989
990 end
991
992 end
993
994 (* --------------------------------------------------------------------------- *)
995
996 (* This signature describes the monolithic (traditional) LR engine. *)
997
998 (* In this interface, the parser controls the lexer. *)
999
1000 module type MONOLITHIC_ENGINE = sig
1001
1002 type state
1003
1004 type token
1005
1006 type semantic_value
1007
1008 (* An entry point to the engine requires a start state, a lexer, and a lexing
1009 buffer. It either succeeds and produces a semantic value, or fails and
1010 raises [Error]. *)
1011
1012 exception Error
1013
1014 val entry:
1015 state ->
1016 (Lexing.lexbuf -> token) ->
1017 Lexing.lexbuf ->
1018 semantic_value
1019
1020 end
1021
1022 (* --------------------------------------------------------------------------- *)
1023
1024 (* The following signatures describe the incremental LR engine. *)
1025
1026 (* First, see [INCREMENTAL_ENGINE] in the file [IncrementalEngine.ml]. *)
1027
1028 (* The [start] function is set apart because we do not wish to publish
1029 it as part of the generated [parser.mli] file. Instead, the table
1030 back-end will publish specialized versions of it, with a suitable
1031 type cast. *)
1032
1033 module type INCREMENTAL_ENGINE_START = sig
1034
1035 (* [start] is an entry point. It requires a start state and a start position
1036 and begins the parsing process. If the lexer is based on an OCaml lexing
1037 buffer, the start position should be [lexbuf.lex_curr_p]. [start] produces
1038 a checkpoint, which usually will be an [InputNeeded] checkpoint. (It could
1039 be [Accepted] if this starting state accepts only the empty word. It could
1040 be [Rejected] if this starting state accepts no word at all.) It does not
1041 raise any exception. *)
1042
1043 (* [start s pos] should really produce a checkpoint of type ['a checkpoint],
1044 for a fixed ['a] that depends on the state [s]. We cannot express this, so
1045 we use [semantic_value checkpoint], which is safe. The table back-end uses
1046 [Obj.magic] to produce safe specialized versions of [start]. *)
1047
1048 type state
1049 type semantic_value
1050 type 'a checkpoint
1051
1052 val start:
1053 state ->
1054 Lexing.position ->
1055 semantic_value checkpoint
1056
1057 end
1058
1059 (* --------------------------------------------------------------------------- *)
1060
1061 (* This signature describes the LR engine, which combines the monolithic
1062 and incremental interfaces. *)
1063
1064 module type ENGINE = sig
1065
1066 include MONOLITHIC_ENGINE
1067
1068 include IncrementalEngine.INCREMENTAL_ENGINE
1069 with type token := token
1070 and type 'a lr1state = state (* useful for us; hidden from the end user *)
1071
1072 include INCREMENTAL_ENGINE_START
1073 with type state := state
1074 and type semantic_value := semantic_value
1075 and type 'a checkpoint := 'a checkpoint
1076
1077 end
1078 end
1079 module Engine = struct
1080 (******************************************************************************)
1081 (* *)
1082 (* Menhir *)
1083 (* *)
1084 (* François Pottier, Inria Paris *)
1085 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
1086 (* *)
1087 (* Copyright Inria. All rights reserved. This file is distributed under the *)
1088 (* terms of the GNU Library General Public License version 2, with a *)
1089 (* special exception on linking, as described in the file LICENSE. *)
1090 (* *)
1091 (******************************************************************************)
1092
1093 type position = Lexing.position
1094 open EngineTypes
1095
1096 (* The LR parsing engine. *)
1097
1098 (* This module is used:
1099
1100 - at compile time, if so requested by the user, via the --interpret options;
1101 - at run time, in the table-based back-end. *)
1102
1103 module Make (T : TABLE) = struct
1104
1105 (* This propagates type and exception definitions. The functions [number],
1106 [production_index], [find_production], too, are defined by this [include]
1107 declaration. *)
1108
1109 include T
1110
1111 type 'a env =
1112 (state, semantic_value, token) EngineTypes.env
1113
1114 (* ------------------------------------------------------------------------ *)
1115
1116 (* The type [checkpoint] represents an intermediate or final result of the
1117 parser. See [EngineTypes]. *)
1118
1119 (* The type [checkpoint] is presented to the user as a private type (see
1120 [IncrementalEngine]). This prevents the user from manufacturing
1121 checkpoints (i.e., continuations) that do not make sense. (Such
1122 continuations could potentially violate the LR invariant and lead to
1123 crashes.) *)
1124
1125 (* 2017/03/29 Although [checkpoint] is a private type, we now expose a
1126 constructor function, [input_needed]. This function allows manufacturing
1127 a checkpoint out of an environment. For this reason, the type [env] must
1128 also be parameterized with ['a]. *)
1129
1130 type 'a checkpoint =
1131 | InputNeeded of 'a env
1132 | Shifting of 'a env * 'a env * bool
1133 | AboutToReduce of 'a env * production
1134 | HandlingError of 'a env
1135 | Accepted of 'a
1136 | Rejected
1137
1138 (* ------------------------------------------------------------------------ *)
1139
1140 (* In the code-based back-end, the [run] function is sometimes responsible
1141 for pushing a new cell on the stack. This is motivated by code sharing
1142 concerns. In this interpreter, there is no such concern; [run]'s caller
1143 is always responsible for updating the stack. *)
1144
1145 (* In the code-based back-end, there is a [run] function for each state
1146 [s]. This function can behave in two slightly different ways, depending
1147 on when it is invoked, or (equivalently) depending on [s].
1148
1149 If [run] is invoked after shifting a terminal symbol (or, equivalently,
1150 if [s] has a terminal incoming symbol), then [run] discards a token,
1151 unless [s] has a default reduction on [#]. (Indeed, in that case,
1152 requesting the next token might drive the lexer off the end of the input
1153 stream.)
1154
1155 If, on the other hand, [run] is invoked after performing a goto
1156 transition, or invoked directly by an entry point, then there is nothing
1157 to discard.
1158
1159 These two cases are reflected in [CodeBackend.gettoken].
1160
1161 Here, the code is structured in a slightly different way. It is up to the
1162 caller of [run] to indicate whether to discard a token, via the parameter
1163 [please_discard]. This flag is set when [s] is being entered by shifting
1164 a terminal symbol and [s] does not have a default reduction on [#]. *)
1165
1166 (* The functions in the following recursive group are tail recursive, produce a
1167 checkpoint of type [semantic_value checkpoint], and cannot raise an
1168 exception. A semantic action can raise [Error], but this exception is
1169 immediately caught within [reduce]. *)
1170
1171 let rec run env please_discard : semantic_value checkpoint =
1172
1173 (* Log the fact that we just entered this state. *)
1174
1175 if log then
1176 Log.state env.current;
1177
1178 (* If [please_discard] is set, we discard the current lookahead token and
1179 fetch the next one. In order to request a token from the user, we
1180 return an [InputNeeded] continuation, which, when invoked by the user,
1181 will take us to [discard]. If [please_discard] is not set, we skip this
1182 step and jump directly to [check_for_default_reduction]. *)
1183
1184 if please_discard then
1185 InputNeeded env
1186 else
1187 check_for_default_reduction env
1188
1189 (* [discard env triple] stores [triple] into [env], overwriting the previous
1190 token. It is invoked by [offer], which itself is invoked by the user in
1191 response to an [InputNeeded] checkpoint. *)
1192
1193 and discard env triple =
1194 if log then begin
1195 let (token, startp, endp) = triple in
1196 Log.lookahead_token (T.token2terminal token) startp endp
1197 end;
1198 let env = { env with error = false; triple } in
1199 check_for_default_reduction env
1200
1201 and check_for_default_reduction env =
1202
1203 (* Examine what situation we are in. This case analysis is analogous to
1204 that performed in [CodeBackend.gettoken], in the sub-case where we do
1205 not have a terminal incoming symbol. *)
1206
1207 T.default_reduction
1208 env.current
1209 announce_reduce (* there is a default reduction; perform it *)
1210 check_for_error_token (* there is none; continue below *)
1211 env
1212
1213 and check_for_error_token env =
1214
1215 (* There is no default reduction. Consult the current lookahead token
1216 so as to determine which action should be taken. *)
1217
1218 (* Peeking at the first input token, without taking it off the input
1219 stream, is done by reading [env.triple]. We are careful to first
1220 check [env.error]. *)
1221
1222 (* Note that, if [please_discard] was true, then we have just called
1223 [discard], so the lookahead token cannot be [error]. *)
1224
1225 (* Returning [HandlingError env] is equivalent to calling [error env]
1226 directly, except it allows the user to regain control. *)
1227
1228 if env.error then begin
1229 if log then
1230 Log.resuming_error_handling();
1231 HandlingError env
1232 end
1233 else
1234 let (token, _, _) = env.triple in
1235
1236 (* We consult the two-dimensional action table, indexed by the
1237 current state and the current lookahead token, in order to
1238 determine which action should be taken. *)
1239
1240 T.action
1241 env.current (* determines a row *)
1242 (T.token2terminal token) (* determines a column *)
1243 (T.token2value token)
1244 shift (* shift continuation *)
1245 announce_reduce (* reduce continuation *)
1246 initiate (* failure continuation *)
1247 env
1248
1249 (* ------------------------------------------------------------------------ *)
1250
1251 (* This function takes care of shift transitions along a terminal symbol.
1252 (Goto transitions are taken care of within [reduce] below.) The symbol
1253 can be either an actual token or the [error] pseudo-token. *)
1254
1255 (* Here, the lookahead token CAN be [error]. *)
1256
1257 and shift env
1258 (please_discard : bool)
1259 (terminal : terminal)
1260 (value : semantic_value)
1261 (s' : state) =
1262
1263 (* Log the transition. *)
1264
1265 if log then
1266 Log.shift terminal s';
1267
1268 (* Push a new cell onto the stack, containing the identity of the
1269 state that we are leaving. *)
1270
1271 let (_, startp, endp) = env.triple in
1272 let stack = {
1273 state = env.current;
1274 semv = value;
1275 startp;
1276 endp;
1277 next = env.stack;
1278 } in
1279
1280 (* Switch to state [s']. *)
1281
1282 let new_env = { env with stack; current = s' } in
1283
1284 (* Expose the transition to the user. (In principle, we have a choice
1285 between exposing the transition before we take it, after we take
1286 it, or at some point in between. This affects the number and type
1287 of the parameters carried by [Shifting]. Here, we choose to expose
1288 the transition after we take it; this allows [Shifting] to carry
1289 only three parameters, whose meaning is simple.) *)
1290
1291 Shifting (env, new_env, please_discard)
1292
1293 (* ------------------------------------------------------------------------ *)
1294
1295 (* The function [announce_reduce] stops the parser and returns a checkpoint
1296 which allows the parser to be resumed by calling [reduce]. *)
1297
1298 (* Only ordinary productions are exposed to the user. Start productions
1299 are not exposed to the user. Reducing a start production simply leads
1300 to the successful termination of the parser. *)
1301
1302 and announce_reduce env (prod : production) =
1303 if T.is_start prod then
1304 accept env prod
1305 else
1306 AboutToReduce (env, prod)
1307
1308 (* The function [reduce] takes care of reductions. It is invoked by
1309 [resume] after an [AboutToReduce] event has been produced. *)
1310
1311 (* Here, the lookahead token CAN be [error]. *)
1312
1313 (* The production [prod] CANNOT be a start production. *)
1314
1315 and reduce env (prod : production) =
1316
1317 (* Log a reduction event. *)
1318
1319 if log then
1320 Log.reduce_or_accept prod;
1321
1322 (* Invoke the semantic action. The semantic action is responsible for
1323 truncating the stack and pushing a new cell onto the stack, which
1324 contains a new semantic value. It can raise [Error]. *)
1325
1326 (* If the semantic action terminates normally, it returns a new stack,
1327 which becomes the current stack. *)
1328
1329 (* If the semantic action raises [Error], we catch it and initiate error
1330 handling. *)
1331
1332 (* This [match/with/exception] construct requires OCaml 4.02. *)
1333
1334 match T.semantic_action prod env with
1335 | stack ->
1336
1337 (* By our convention, the semantic action has produced an updated
1338 stack. The state now found in the top stack cell is the return
1339 state. *)
1340
1341 (* Perform a goto transition. The target state is determined
1342 by consulting the goto table at the return state and at
1343 production [prod]. *)
1344
1345 let current = T.goto_prod stack.state prod in
1346 let env = { env with stack; current } in
1347 run env false
1348
1349 | exception Error ->
1350 initiate env
1351
1352 and accept env prod =
1353 (* Log an accept event. *)
1354 if log then
1355 Log.reduce_or_accept prod;
1356 (* Extract the semantic value out of the stack. *)
1357 let v = env.stack.semv in
1358 (* Finish. *)
1359 Accepted v
1360
1361 (* ------------------------------------------------------------------------ *)
1362
1363 (* The following functions deal with errors. *)
1364
1365 (* [initiate] initiates or resumes error handling. *)
1366
1367 (* Here, the lookahead token CAN be [error]. *)
1368
1369 and initiate env =
1370 if log then
1371 Log.initiating_error_handling();
1372 let env = { env with error = true } in
1373 HandlingError env
1374
1375 (* [error] handles errors. *)
1376
1377 and error env =
1378 assert env.error;
1379
1380 (* Consult the column associated with the [error] pseudo-token in the
1381 action table. *)
1382
1383 T.action
1384 env.current (* determines a row *)
1385 T.error_terminal (* determines a column *)
1386 T.error_value
1387 error_shift (* shift continuation *)
1388 error_reduce (* reduce continuation *)
1389 error_fail (* failure continuation *)
1390 env
1391
1392 and error_shift env please_discard terminal value s' =
1393
1394 (* Here, [terminal] is [T.error_terminal],
1395 and [value] is [T.error_value]. *)
1396
1397 assert (terminal = T.error_terminal && value = T.error_value);
1398
1399 (* This state is capable of shifting the [error] token. *)
1400
1401 if log then
1402 Log.handling_error env.current;
1403 shift env please_discard terminal value s'
1404
1405 and error_reduce env prod =
1406
1407 (* This state is capable of performing a reduction on [error]. *)
1408
1409 if log then
1410 Log.handling_error env.current;
1411 reduce env prod
1412 (* Intentionally calling [reduce] instead of [announce_reduce].
1413 It does not seem very useful, and it could be confusing, to
1414 expose the reduction steps taken during error handling. *)
1415
1416 and error_fail env =
1417
1418 (* This state is unable to handle errors. Attempt to pop a stack
1419 cell. *)
1420
1421 let cell = env.stack in
1422 let next = cell.next in
1423 if next == cell then
1424
1425 (* The stack is empty. Die. *)
1426
1427 Rejected
1428
1429 else begin
1430
1431 (* The stack is nonempty. Pop a cell, updating the current state
1432 with that found in the popped cell, and try again. *)
1433
1434 let env = { env with
1435 stack = next;
1436 current = cell.state
1437 } in
1438 HandlingError env
1439
1440 end
1441
1442 (* End of the nest of tail recursive functions. *)
1443
1444 (* ------------------------------------------------------------------------ *)
1445 (* ------------------------------------------------------------------------ *)
1446
1447 (* The incremental interface. See [EngineTypes]. *)
1448
1449 (* [start s] begins the parsing process. *)
1450
1451 let start (s : state) (initial : position) : semantic_value checkpoint =
1452
1453 (* Build an empty stack. This is a dummy cell, which is its own successor.
1454 Its [next] field WILL be accessed by [error_fail] if an error occurs and
1455 is propagated all the way until the stack is empty. Its [endp] field WILL
1456 be accessed (by a semantic action) if an epsilon production is reduced
1457 when the stack is empty. *)
1458
1459 let rec empty = {
1460 state = s; (* dummy *)
1461 semv = T.error_value; (* dummy *)
1462 startp = initial; (* dummy *)
1463 endp = initial;
1464 next = empty;
1465 } in
1466
1467 (* Build an initial environment. *)
1468
1469 (* Unfortunately, there is no type-safe way of constructing a
1470 dummy token. Tokens carry semantic values, which in general
1471 we cannot manufacture. This instance of [Obj.magic] could
1472 be avoided by adopting a different representation (e.g., no
1473 [env.error] field, and an option in the first component of
1474 [env.triple]), but I like this representation better. *)
1475
1476 let dummy_token = Obj.magic () in
1477 let env = {
1478 error = false;
1479 triple = (dummy_token, initial, initial); (* dummy *)
1480 stack = empty;
1481 current = s;
1482 } in
1483
1484 (* Begin parsing. *)
1485
1486 (* The parameter [please_discard] here is [true], which means we know
1487 that we must read at least one token. This claim relies on the fact
1488 that we have ruled out the two special cases where a start symbol
1489 recognizes the empty language or the singleton language {epsilon}. *)
1490
1491 run env true
1492
1493 (* [offer checkpoint triple] is invoked by the user in response to a
1494 checkpoint of the form [InputNeeded env]. It checks that [checkpoint] is
1495 indeed of this form, and invokes [discard]. *)
1496
1497 (* [resume checkpoint] is invoked by the user in response to a checkpoint of
1498 the form [AboutToReduce (env, prod)] or [HandlingError env]. It checks
1499 that [checkpoint] is indeed of this form, and invokes [reduce] or
1500 [error], as appropriate. *)
1501
1502 (* In reality, [offer] and [resume] accept an argument of type
1503 [semantic_value checkpoint] and produce a checkpoint of the same type.
1504 The choice of [semantic_value] is forced by the fact that this is the
1505 parameter of the checkpoint [Accepted]. *)
1506
1507 (* We change this as follows. *)
1508
1509 (* We change the argument and result type of [offer] and [resume] from
1510 [semantic_value checkpoint] to ['a checkpoint]. This is safe, in this
1511 case, because we give the user access to values of type [t checkpoint]
1512 only if [t] is indeed the type of the eventual semantic value for this
1513 run. (More precisely, by examining the signatures [INCREMENTAL_ENGINE]
1514 and [INCREMENTAL_ENGINE_START], one finds that the user can build a value
1515 of type ['a checkpoint] only if ['a] is [semantic_value]. The table
1516 back-end goes further than this and produces versions of [start] composed
1517 with a suitable cast, which give the user access to a value of type
1518 [t checkpoint] where [t] is the type of the start symbol.) *)
1519
1520 let offer : 'a . 'a checkpoint ->
1521 token * position * position ->
1522 'a checkpoint
1523 = function
1524 | InputNeeded env ->
1525 Obj.magic discard env
1526 | _ ->
1527 invalid_arg "offer expects InputNeeded"
1528
1529 let resume : 'a . 'a checkpoint -> 'a checkpoint = function
1530 | HandlingError env ->
1531 Obj.magic error env
1532 | Shifting (_, env, please_discard) ->
1533 Obj.magic run env please_discard
1534 | AboutToReduce (env, prod) ->
1535 Obj.magic reduce env prod
1536 | _ ->
1537 invalid_arg "resume expects HandlingError | Shifting | AboutToReduce"
1538
1539 (* ------------------------------------------------------------------------ *)
1540 (* ------------------------------------------------------------------------ *)
1541
1542 (* The traditional interface. See [EngineTypes]. *)
1543
1544 (* ------------------------------------------------------------------------ *)
1545
1546 (* Wrapping a lexer and lexbuf as a token supplier. *)
1547
1548 type supplier =
1549 unit -> token * position * position
1550
1551 let lexer_lexbuf_to_supplier
1552 (lexer : Lexing.lexbuf -> token)
1553 (lexbuf : Lexing.lexbuf)
1554 : supplier =
1555 fun () ->
1556 let token = lexer lexbuf in
1557 let startp = lexbuf.Lexing.lex_start_p
1558 and endp = lexbuf.Lexing.lex_curr_p in
1559 token, startp, endp
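(* For instance, an ocamllex-generated lexer and a lexing buffer can be turned
   into a supplier as follows (an illustrative sketch; [Lexer.token] stands for
   a hypothetical lexer entry point, not part of this library):

     let lexbuf = Lexing.from_channel stdin in
     let supplier = lexer_lexbuf_to_supplier Lexer.token lexbuf in
     (* each call [supplier()] now yields a [(token, startp, endp)] triple *)
     ignore (supplier ())
 *)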
1560
1561 (* ------------------------------------------------------------------------ *)
1562
1563 (* The main loop repeatedly handles intermediate checkpoints, until a final
1564 checkpoint is obtained. This allows implementing the monolithic interface
1565 ([entry]) in terms of the incremental interface ([start], [offer],
1566 [resume]). *)
1567
1568 (* By convention, acceptance is reported by returning a semantic value,
1569 whereas rejection is reported by raising [Error]. *)
1570
1571 (* [loop] is polymorphic in ['a]. No cheating is involved in achieving this.
1572 All of the cheating resides in the types assigned to [offer] and [resume]
1573 above. *)
1574
1575 let rec loop : 'a . supplier -> 'a checkpoint -> 'a =
1576 fun read checkpoint ->
1577 match checkpoint with
1578 | InputNeeded _ ->
1579 (* The parser needs a token. Request one from the lexer,
1580 and offer it to the parser, which will produce a new
1581 checkpoint. Then, repeat. *)
1582 let triple = read() in
1583 let checkpoint = offer checkpoint triple in
1584 loop read checkpoint
1585 | Shifting _
1586 | AboutToReduce _
1587 | HandlingError _ ->
1588 (* The parser has suspended itself, but does not need
1589 new input. Just resume the parser. Then, repeat. *)
1590 let checkpoint = resume checkpoint in
1591 loop read checkpoint
1592 | Accepted v ->
1593 (* The parser has succeeded and produced a semantic value.
1594 Return this semantic value to the user. *)
1595 v
1596 | Rejected ->
1597 (* The parser rejects this input. Raise an exception. *)
1598 raise Error
1599
1600 let entry (s : state) lexer lexbuf : semantic_value =
1601 let initial = lexbuf.Lexing.lex_curr_p in
1602 loop (lexer_lexbuf_to_supplier lexer lexbuf) (start s initial)
1603
1604 (* ------------------------------------------------------------------------ *)
1605
1606 (* [loop_handle] stops if it encounters an error, and at this point, invokes
1607 its failure continuation, without letting Menhir do its own traditional
1608 error-handling (which involves popping the stack, etc.). *)
1609
1610 let rec loop_handle succeed fail read checkpoint =
1611 match checkpoint with
1612 | InputNeeded _ ->
1613 let triple = read() in
1614 let checkpoint = offer checkpoint triple in
1615 loop_handle succeed fail read checkpoint
1616 | Shifting _
1617 | AboutToReduce _ ->
1618 let checkpoint = resume checkpoint in
1619 loop_handle succeed fail read checkpoint
1620 | HandlingError _
1621 | Rejected ->
1622 (* The parser has detected an error. Invoke the failure continuation. *)
1623 fail checkpoint
1624 | Accepted v ->
1625 (* The parser has succeeded and produced a semantic value. Invoke the
1626 success continuation. *)
1627 succeed v
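(* An illustrative use of [loop_handle] (a sketch; [supplier] and [checkpoint]
   are assumed to have been obtained via [lexer_lexbuf_to_supplier] and a
   generated incremental entry point):

     let outcome : (semantic_value, string) result =
       loop_handle
         (fun v -> Ok v)                           (* acceptance *)
         (fun _checkpoint -> Error "syntax error") (* failure, no recovery *)
         supplier checkpoint
 *)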
1628
1629 (* ------------------------------------------------------------------------ *)
1630
1631 (* [loop_handle_undo] is analogous to [loop_handle], except it passes a pair
1632 of checkpoints to the failure continuation.
1633
1634 The first (and oldest) checkpoint is the last [InputNeeded] checkpoint
1635 that was encountered before the error was detected. The second (and
1636 newest) checkpoint is where the error was detected, as in [loop_handle].
1637 Going back to the first checkpoint can be thought of as undoing any
1638 reductions that were performed after seeing the problematic token. (These
1639 reductions must be default reductions or spurious reductions.) *)
1640
1641 let rec loop_handle_undo succeed fail read (inputneeded, checkpoint) =
1642 match checkpoint with
1643 | InputNeeded _ ->
1644 (* Update the last recorded [InputNeeded] checkpoint. *)
1645 let inputneeded = checkpoint in
1646 let triple = read() in
1647 let checkpoint = offer checkpoint triple in
1648 loop_handle_undo succeed fail read (inputneeded, checkpoint)
1649 | Shifting _
1650 | AboutToReduce _ ->
1651 let checkpoint = resume checkpoint in
1652 loop_handle_undo succeed fail read (inputneeded, checkpoint)
1653 | HandlingError _
1654 | Rejected ->
1655 fail inputneeded checkpoint
1656 | Accepted v ->
1657 succeed v
1658
1659 (* For simplicity, we publish a version of [loop_handle_undo] that takes a
1660 single checkpoint as an argument, instead of a pair of checkpoints. We
1661 check that the argument is [InputNeeded _], and duplicate it. *)
1662
1663 (* The parser cannot accept or reject before it asks for the very first
1664 character of input. (Indeed, we statically reject a symbol that
1665 generates the empty language or the singleton language {epsilon}.)
1666 So, the [start] checkpoint must match [InputNeeded _]. Hence, it is
1667 permitted to call [loop_handle_undo] with a [start] checkpoint. *)
1668
1669 let loop_handle_undo succeed fail read checkpoint =
1670 assert (match checkpoint with InputNeeded _ -> true | _ -> false);
1671 loop_handle_undo succeed fail read (checkpoint, checkpoint)
1672
1673 (* ------------------------------------------------------------------------ *)
1674
1675 let rec shifts checkpoint =
1676 match checkpoint with
1677 | Shifting (env, _, _) ->
1678 (* The parser is about to shift, which means it is willing to
1679 consume the terminal symbol that we have fed it. Return the
1680 environment just before this transition. *)
1681 Some env
1682 | AboutToReduce _ ->
1683 (* The parser wishes to reduce. Just follow. *)
1684 shifts (resume checkpoint)
1685 | HandlingError _ ->
1686 (* The parser fails, which means it rejects the terminal symbol
1687 that we have fed it. *)
1688 None
1689 | InputNeeded _
1690 | Accepted _
1691 | Rejected ->
1692 (* None of these cases can arise. Indeed, after a token is submitted
1693 to it, the parser must shift, reduce, or signal an error, before
1694 it can request another token or terminate. *)
1695 assert false
1696
1697 let acceptable checkpoint token pos =
1698 let triple = (token, pos, pos) in
1699 let checkpoint = offer checkpoint triple in
1700 match shifts checkpoint with
1701 | None -> false
1702 | Some _env -> true
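(* For example, an error recovery strategy might probe whether a candidate
   token would allow the parser to make progress (an illustrative sketch;
   [SEMICOLON] stands for a hypothetical token of the user's grammar):

     if acceptable (input_needed env) SEMICOLON pos then
       (* offering a semicolon at [pos] would be accepted by the parser *)
       ...
 *)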
1703
1704 (* ------------------------------------------------------------------------ *)
1705
1706 (* The type ['a lr1state] describes the (non-initial) states of the LR(1)
1707 automaton. The index ['a] represents the type of the semantic value
1708 associated with the state's incoming symbol. *)
1709
1710 (* The type ['a lr1state] is defined as an alias for [state], which itself
1711 is usually defined as [int] (see [TableInterpreter]). So, ['a lr1state]
1712 is technically a phantom type, but should really be thought of as a GADT
1713 whose data constructors happen to be represented as integers. It is
1714 presented to the user as an abstract type (see [IncrementalEngine]). *)
1715
1716 type 'a lr1state =
1717 state
1718
1719 (* ------------------------------------------------------------------------ *)
1720
1721 (* Stack inspection. *)
1722
1723 (* We offer a read-only view of the parser's state as a stream of elements.
1724 Each element contains a pair of a (non-initial) state and a semantic
1725 value associated with (the incoming symbol of) this state. Note that the
1726 type [element] is an existential type. *)
1727
1728 (* As of 2017/03/31, the type [stack] and the function [stack] are DEPRECATED.
1729 If desired, they could now be implemented outside Menhir, by relying on
1730 the functions [top] and [pop]. *)
1731
1732 type element =
1733 | Element: 'a lr1state * 'a * position * position -> element
1734
1735 open General
1736
1737 type stack =
1738 element stream
1739
1740 (* If [current] is the current state and [cell] is the top stack cell,
1741 then [stack cell current] is a view of the parser's state as a stream
1742 of elements. *)
1743
1744 let rec stack cell current : element stream =
1745 lazy (
1746 (* The stack is empty iff the top stack cell is its own successor. In
1747 that case, the current state [current] should be an initial state
1748 (which has no incoming symbol).
1749 We do not allow the user to inspect this state. *)
1750 let next = cell.next in
1751 if next == cell then
1752 Nil
1753 else
1754 (* Construct an element containing the current state [current] as well
1755 as the semantic value contained in the top stack cell. This semantic
1756 value is associated with the incoming symbol of this state, so it
1757 makes sense to pair them together. The state has type ['a state] and
1758 the semantic value has type ['a], for some type ['a]. Here, the OCaml
1759 type-checker thinks ['a] is [semantic_value] and considers this code
1760 well-typed. Outside, we will use magic to provide the user with a way
1761 of inspecting states and recovering the value of ['a]. *)
1762 let element = Element (
1763 current,
1764 cell.semv,
1765 cell.startp,
1766 cell.endp
1767 ) in
1768 Cons (element, stack next cell.state)
1769 )
1770
1771 let stack env : element stream =
1772 stack env.stack env.current
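(* For instance, this stream view can be turned into a list of elements, top
   of stack first, using [foldr] from the [General] module (an illustrative
   sketch):

     let elements (env : 'a env) : element list =
       foldr (fun e es -> e :: es) (stack env) []
 *)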
1773
1774 (* As explained above, the function [top] allows access to the top stack
1775 element only if the stack is nonempty, i.e., only if the current state
1776 is not an initial state. *)
1777
1778 let top env : element option =
1779 let cell = env.stack in
1780 let next = cell.next in
1781 if next == cell then
1782 None
1783 else
1784 Some (Element (env.current, cell.semv, cell.startp, cell.endp))
1785
1786 (* [equal] compares the stacks for physical equality, and compares the
1787 current states via their numbers (this seems cleaner than using OCaml's
1788 polymorphic equality). *)
1789
1790 (* The two fields that are not compared by [equal], namely [error] and
1791 [triple], are overwritten by the function [discard], which handles
1792 [InputNeeded] checkpoints. Thus, if [equal env1 env2] holds, then the
1793 checkpoints [input_needed env1] and [input_needed env2] are
1794 equivalent: they lead the parser to behave in the same way. *)
1795
1796 let equal env1 env2 =
1797 env1.stack == env2.stack &&
1798 number env1.current = number env2.current
1799
1800 let current_state_number env =
1801 number env.current
1802
1803 (* ------------------------------------------------------------------------ *)
1804
1805 (* Access to the position of the lookahead token. *)
1806
1807 let positions { triple = (_, startp, endp); _ } =
1808 startp, endp
1809
1810 (* ------------------------------------------------------------------------ *)
1811
1812 (* Access to information about default reductions. *)
1813
1814 (* This can be a function of states, or a function of environments.
1815 We offer both. *)
1816
1817 (* Instead of a Boolean result, we could return a [production option].
1818 However, we would have to explicitly test whether [prod] is a start
1819 production, and in that case, return [None], I suppose. Indeed, we
1820 have decided not to expose the start productions. *)
1821
1822 let state_has_default_reduction (state : _ lr1state) : bool =
1823 T.default_reduction state
1824 (fun _env _prod -> true)
1825 (fun _env -> false)
1826 ()
1827
1828 let env_has_default_reduction env =
1829 state_has_default_reduction env.current
1830
1831 (* ------------------------------------------------------------------------ *)
1832
1833 (* The following functions work at the level of environments (as opposed to
1834 checkpoints). The function [pop] causes the automaton to go back into the
1835 past, pretending that the last input symbol has never been read. The
1836 function [force_reduction] causes the automaton to re-interpret the past,
1837 by recognizing the right-hand side of a production and reducing this
1838 production. The function [feed] causes the automaton to progress into the
1839 future by pretending that a (terminal or nonterminal) symbol has been
1840 read. *)
1841
1842 (* The function [feed] would ideally be defined here. However, for this
1843 function to be type-safe, the GADT ['a symbol] is needed. For this
1844 reason, we move its definition to [InspectionTableInterpreter], where
1845 the inspection API is available. *)
1846
1847 (* [pop] pops one stack cell. It cannot go wrong. *)
1848
1849 let pop (env : 'a env) : 'a env option =
1850 let cell = env.stack in
1851 let next = cell.next in
1852 if next == cell then
1853 (* The stack is empty. *)
1854 None
1855 else
1856 (* The stack is nonempty. Pop off one cell. *)
1857 Some { env with stack = next; current = cell.state }
1858
1859 (* [force_reduction] is analogous to [reduce], except that it does not
1860 continue by calling [run env] or [initiate env]. Instead, it returns
1861 [env] to the user. *)
1862
1863 (* [force_reduction] is dangerous insofar as it executes a semantic action.
1864 This semantic action could have side effects: nontermination, state,
1865 exceptions, input/output, etc. *)
1866
1867 let force_reduction prod (env : 'a env) : 'a env =
1868 (* Check if this reduction is permitted. This check is REALLY important.
1869 The stack must have the correct shape: that is, it must be sufficiently
1870 high, and must contain semantic values of appropriate types, otherwise
1871 the semantic action will crash and burn. *)
1872 (* We currently check whether the current state is WILLING to reduce this
1873 production (i.e., there is a reduction action in the action table row
1874 associated with this state), whereas it would be more liberal to check
1875 whether this state is CAPABLE of reducing this production (i.e., the
1876 stack has an appropriate shape). We currently have no means of
1877 performing such a check. *)
1878 if not (T.may_reduce env.current prod) then
1879 invalid_arg "force_reduction: this reduction is not permitted in this state"
1880 else begin
1881 (* We do not expose the start productions to the user, so this cannot be
1882 a start production. Hence, it has a semantic action. *)
1883 assert (not (T.is_start prod));
1884 (* Invoke the semantic action. *)
1885 let stack = T.semantic_action prod env in
1886 (* Perform a goto transition. *)
1887 let current = T.goto_prod stack.state prod in
1888 { env with stack; current }
1889 end
1890
1891 (* The environment manipulation functions -- [pop] and [force_reduction]
1892 above, plus [feed] -- manipulate the automaton's stack and current state,
1893 but do not affect the automaton's lookahead symbol. When the function
1894 [input_needed] is used to go back from an environment to a checkpoint
1895 (and therefore, resume normal parsing), the lookahead symbol is clobbered
1896 anyway, since the only action that the user can take is to call [offer].
1897 So far, so good. One problem, though, is that this call to [offer] may
1898 well place the automaton in a configuration of a state [s] and a
1899 lookahead symbol [t] that is normally unreachable. Also, perhaps the
1900 state [s] is a state where an input symbol normally is never demanded, so
1901 this [InputNeeded] checkpoint is fishy. There does not seem to be a deep
1902 problem here, but, when programming an error recovery strategy, one
1903 should pay some attention to this issue. Ideally, perhaps, one should use
1904 [input_needed] only in a state [s] where an input symbol is normally
1905 demanded, that is, a state [s] whose incoming symbol is a terminal symbol
1906 and which does not have a default reduction on [#]. *)
1907
1908 let input_needed (env : 'a env) : 'a checkpoint =
1909 InputNeeded env
1910
1911 (* The following functions are compositions of [top] and [pop]. *)
1912
1913 let rec pop_many i env =
1914 if i = 0 then
1915 Some env
1916 else match pop env with
1917 | None ->
1918 None
1919 | Some env ->
1920 pop_many (i - 1) env
1921
1922 let get i env =
1923 match pop_many i env with
1924 | None ->
1925 None
1926 | Some env ->
1927 top env
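(* For instance, [get 0 env] is equivalent to [top env], while [get 1 env]
   returns the element just below the top of the stack, if there is one. *)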
1928
1929 end
1930 end
1931 module ErrorReports = struct
1932 (******************************************************************************)
1933 (* *)
1934 (* Menhir *)
1935 (* *)
1936 (* François Pottier, Inria Paris *)
1937 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
1938 (* *)
1939 (* Copyright Inria. All rights reserved. This file is distributed under the *)
1940 (* terms of the GNU Library General Public License version 2, with a *)
1941 (* special exception on linking, as described in the file LICENSE. *)
1942 (* *)
1943 (******************************************************************************)
1944
1945 (* -------------------------------------------------------------------------- *)
1946
1947 (* A two-place buffer stores zero, one, or two elements. *)
1948
1949 type 'a content =
1950 | Zero
1951 | One of 'a
1952 | Two of 'a * (* most recent: *) 'a
1953
1954 type 'a buffer =
1955 'a content ref
1956
1957 (* [update buffer x] pushes [x] into [buffer], causing the buffer to slide. *)
1958
1959 let update buffer x =
1960 buffer :=
1961 match !buffer, x with
1962 | Zero, _ ->
1963 One x
1964 | One x1, x2
1965 | Two (_, x1), x2 ->
1966 Two (x1, x2)
1967
1968 (* [show f buffer] prints the contents of the buffer. The function [f] is
1969 used to print an element. *)
1970
1971 let show f buffer : string =
1972 match !buffer with
1973 | Zero ->
1974 (* The buffer cannot be empty. If we have read no tokens,
1975 we cannot have detected a syntax error. *)
1976 assert false
1977 | One invalid ->
1978 (* It is unlikely, but possible, that we have read just one token. *)
1979 Printf.sprintf "before '%s'" (f invalid)
1980 | Two (valid, invalid) ->
1981 (* In the most likely case, we have read two tokens. *)
1982 Printf.sprintf "after '%s' and before '%s'" (f valid) (f invalid)
1983
1984 (* [last buffer] returns the last element of the buffer (that is, the invalid
1985 token). *)
1986
1987 let last buffer =
1988 match !buffer with
1989 | Zero ->
1990 (* The buffer cannot be empty. If we have read no tokens,
1991 we cannot have detected a syntax error. *)
1992 assert false
1993 | One invalid
1994 | Two (_, invalid) ->
1995 invalid
1996
1997 (* [wrap lexer] returns a pair of a fresh (initially empty) buffer and a wrapped lexer; the wrapped lexer invokes [lexer] and records the start and end positions of each token in the buffer. *)
1998
1999 open Lexing
2000
2001 let wrap lexer =
2002 let buffer = ref Zero in
2003 buffer,
2004 fun lexbuf ->
2005 let token = lexer lexbuf in
2006 update buffer (lexbuf.lex_start_p, lexbuf.lex_curr_p);
2007 token
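(* An illustrative sketch of how [wrap] and [show] fit together in a driver
   ([Lexer.token] and [range] are hypothetical and not part of this library;
   [range] formats a pair of positions as a string):

     let buffer, lexer = wrap Lexer.token in
     (* ... run the parser, reading tokens via [lexer] ... *)
     (* upon a syntax error, describe where it was detected: *)
     prerr_endline ("Syntax error " ^ show range buffer ^ ".")
 *)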
2008
2009 (* -------------------------------------------------------------------------- *)
2010 end
2011 module Printers = struct
2012 (******************************************************************************)
2013 (* *)
2014 (* Menhir *)
2015 (* *)
2016 (* François Pottier, Inria Paris *)
2017 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2018 (* *)
2019 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2020 (* terms of the GNU Library General Public License version 2, with a *)
2021 (* special exception on linking, as described in the file LICENSE. *)
2022 (* *)
2023 (******************************************************************************)
2024
2025 module Make
2026 (I : IncrementalEngine.EVERYTHING)
2027 (User : sig
2028 val print: string -> unit
2029 val print_symbol: I.xsymbol -> unit
2030 val print_element: (I.element -> unit) option
2031 end)
2032 = struct
2033
2034 let arrow = " -> "
2035 let dot = "."
2036 let space = " "
2037 let newline = "\n"
2038
2039 open User
2040 open I
2041
2042 (* Printing a list of symbols. An optional dot is printed at offset
2043 [i] into the list [symbols], if this offset lies between [0] and
2044 the length of the list (inclusive). *)
2045
2046 let rec print_symbols i symbols =
2047 if i = 0 then begin
2048 print dot;
2049 print space;
2050 print_symbols (-1) symbols
2051 end
2052 else begin
2053 match symbols with
2054 | [] ->
2055 ()
2056 | symbol :: symbols ->
2057 print_symbol symbol;
2058 print space;
2059 print_symbols (i - 1) symbols
2060 end
2061
2062 (* Printing an element as a symbol. *)
2063
2064 let print_element_as_symbol element =
2065 match element with
2066 | Element (s, _, _, _) ->
2067 print_symbol (X (incoming_symbol s))
2068
2069 (* Some of the functions that follow need an element printer. They use
2070 [print_element] if provided by the user; otherwise they use
2071 [print_element_as_symbol]. *)
2072
2073 let print_element =
2074 match print_element with
2075 | Some print_element ->
2076 print_element
2077 | None ->
2078 print_element_as_symbol
2079
2080 (* Printing a stack as a list of symbols. Stack bottom on the left,
2081 stack top on the right. *)
2082
2083 let rec print_stack env =
2084 match top env, pop env with
2085 | Some element, Some env ->
2086 print_stack env;
2087 print space;
2088 print_element element
2089 | _, _ ->
2090 ()
2091
2092 let print_stack env =
2093 print_stack env;
2094 print newline
2095
2096 (* Printing an item. *)
2097
2098 let print_item (prod, i) =
2099 print_symbol (lhs prod);
2100 print arrow;
2101 print_symbols i (rhs prod);
2102 print newline
2103
2104 (* Printing a list of symbols (public version). *)
2105
2106 let print_symbols symbols =
2107 print_symbols (-1) symbols
2108
2109 (* Printing a production (without a dot). *)
2110
2111 let print_production prod =
2112 print_item (prod, -1)
2113
2114 (* Printing the current LR(1) state. *)
2115
2116 let print_current_state env =
2117 print "Current LR(1) state: ";
2118 match top env with
2119 | None ->
2120 print "<some initial state>"; (* TEMPORARY unsatisfactory *)
2121 print newline
2122 | Some (Element (current, _, _, _)) ->
2123 print (string_of_int (number current));
2124 print newline;
2125 List.iter print_item (items current)
2126
2127 let print_env env =
2128 print_stack env;
2129 print_current_state env;
2130 print newline
2131
2132 end
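(* An illustrative sketch of how this functor might be instantiated, assuming a
   parser [Parser] generated with [--table --inspection], whose incremental API
   is exposed as [Parser.MenhirInterpreter], and a user-defined function
   [symbol_to_string]:

     module P = Printers.Make (Parser.MenhirInterpreter) (struct
       let print = print_string
       let print_symbol s = print_string (symbol_to_string s)
       let print_element = None
     end)

   Then [P.print_env env] prints the stack and the current LR(1) state. *)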
2133 end
2134 module InfiniteArray = struct
2135 (******************************************************************************)
2136 (* *)
2137 (* Menhir *)
2138 (* *)
2139 (* François Pottier, Inria Paris *)
2140 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2141 (* *)
2142 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2143 (* terms of the GNU Library General Public License version 2, with a *)
2144 (* special exception on linking, as described in the file LICENSE. *)
2145 (* *)
2146 (******************************************************************************)
2147
2148 (** This module implements infinite arrays, that is, arrays that grow
2149 transparently upon demand. *)
2150
2151 type 'a t = {
2152 default: 'a;
2153 mutable table: 'a array;
2154 mutable extent: int; (* the index of the greatest [set] ever, plus one *)
2155 }
2156
2157 let default_size =
2158 16384 (* must be non-zero *)
2159
2160 let make x = {
2161 default = x;
2162 table = Array.make default_size x;
2163 extent = 0;
2164 }
2165
2166 let rec new_length length i =
2167 if i < length then
2168 length
2169 else
2170 new_length (2 * length) i
2171
2172 let ensure a i =
2173 assert (0 <= i);
2174 let table = a.table in
2175 let length = Array.length table in
2176 if i >= length then begin
2177 let table' = Array.make (new_length (2 * length) i) a.default in
2178 Array.blit table 0 table' 0 length;
2179 a.table <- table'
2180 end
2181
2182 let get a i =
2183 ensure a i;
2184 Array.unsafe_get a.table i
2185
2186 let set a i x =
2187 ensure a i;
2188 Array.unsafe_set a.table i x;
2189 if a.extent <= i then
2190 a.extent <- i + 1
2191
2192 let extent a =
2193 a.extent
2194
2195 let domain a =
2196 Array.sub a.table 0 a.extent
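(* An illustrative sketch of the intended behavior:

     let a = make 0 in
     set a 5 42;
     assert (get a 5 = 42);                      (* a stored element *)
     assert (get a 100 = 0);                     (* beyond the extent: the default *)
     assert (extent a = 6);                      (* greatest [set] index, plus one *)
     assert (domain a = [| 0; 0; 0; 0; 0; 42 |])
 *)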
2197
2198 end
2199 module PackedIntArray = struct
2200 (******************************************************************************)
2201 (* *)
2202 (* Menhir *)
2203 (* *)
2204 (* François Pottier, Inria Paris *)
2205 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2206 (* *)
2207 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2208 (* terms of the GNU Library General Public License version 2, with a *)
2209 (* special exception on linking, as described in the file LICENSE. *)
2210 (* *)
2211 (******************************************************************************)
2212
2213 (* A packed integer array is represented as a pair of an integer [k] and
2214 a string [s]. The integer [k] is the number of bits per integer that we
2215 use. The string [s] is just an array of bits, which is read in 8-bit
2216 chunks. *)
2217
2218 (* The OCaml programming language treats string literals and array literals
2219 in slightly different ways: the former are statically allocated, while
2220 the latter are dynamically allocated. (This is rather arbitrary.) In the
2221 context of Menhir's table-based back-end, where compact, immutable
2222 integer arrays are needed, OCaml strings are preferable to OCaml arrays. *)
2223
2224 type t =
2225 int * string
2226
2227 (* The magnitude [k] of an integer [v] is the number of bits required
2228 to represent [v]. It is rounded up to the nearest power of two, so
2229 that [k] divides [Sys.word_size]. *)
2230
2231 let magnitude (v : int) =
2232 if v < 0 then
2233 Sys.word_size
2234 else
2235 let rec check k max = (* [max] equals [2^k] *)
2236 if (max <= 0) || (v < max) then
2237 k
2238 (* if [max] just overflowed, then [v] requires a full OCaml
2239 integer, and [k] is the number of bits in an OCaml integer
2240 plus one, that is, [Sys.word_size]. *)
2241 else
2242 check (2 * k) (max * max)
2243 in
2244 check 1 2
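(* For instance, [magnitude 0] and [magnitude 1] are [1], [magnitude 3] is [2],
   [magnitude 255] is [8], and [magnitude 256] is [16]; any negative argument
   yields [Sys.word_size]. *)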
2245
2246 (* [pack a] turns an array of integers into a packed integer array. *)
2247
2248 (* Because the sign bit is the most significant bit, the magnitude of
2249 any negative number is the word size. In other words, [pack] does
2250 not achieve any space savings as soon as [a] contains any negative
2251 numbers, even if they are ``small''. *)
2252
2253 let pack (a : int array) : t =
2254
2255 let m = Array.length a in
2256
2257 (* Compute the maximum magnitude of the array elements. This tells
2258 us how many bits per element we are going to use. *)
2259
2260 let k =
2261 Array.fold_left (fun k v ->
2262 max k (magnitude v)
2263 ) 1 a
2264 in
2265
2266 (* Because access to OCaml strings is performed on an 8-bit basis,
2267 two cases arise. If [k] is less than 8, then we can pack multiple
2268 array entries into a single character. If [k] is greater than 8,
2269 then we must use multiple characters to represent a single array
2270 entry. *)
2271
2272 if k <= 8 then begin
2273
2274 (* [w] is the number of array entries that we pack in a character. *)
2275
2276 assert (8 mod k = 0);
2277 let w = 8 / k in
2278
2279 (* [n] is the length of the string that we allocate. *)
2280
2281 let n =
2282 if m mod w = 0 then
2283 m / w
2284 else
2285 m / w + 1
2286 in
2287
2288 let s =
2289 Bytes.create n
2290 in
2291
2292 (* Define a reader for the source array. The reader might run off
2293 the end if [w] does not divide [m]. *)
2294
2295 let i = ref 0 in
2296 let next () =
2297 let ii = !i in
2298 if ii = m then
2299 0 (* ran off the end, pad with zeroes *)
2300 else
2301 let v = a.(ii) in
2302 i := ii + 1;
2303 v
2304 in
2305
2306 (* Fill up the string. *)
2307
2308 for j = 0 to n - 1 do
2309 let c = ref 0 in
2310 for _x = 1 to w do
2311 c := (!c lsl k) lor next()
2312 done;
2313 Bytes.set s j (Char.chr !c)
2314 done;
2315
2316 (* Done. *)
2317
2318 k, Bytes.unsafe_to_string s
2319
2320 end
2321 else begin (* k > 8 *)
2322
2323 (* [w] is the number of characters that we use to encode an array entry. *)
2324
2325 assert (k mod 8 = 0);
2326 let w = k / 8 in
2327
2328 (* [n] is the length of the string that we allocate. *)
2329
2330 let n =
2331 m * w
2332 in
2333
2334 let s =
2335 Bytes.create n
2336 in
2337
2338 (* Fill up the string. *)
2339
2340 for i = 0 to m - 1 do
2341 let v = ref a.(i) in
2342 for x = 1 to w do
2343 Bytes.set s ((i + 1) * w - x) (Char.chr (!v land 255));
2344 v := !v lsr 8
2345 done
2346 done;
2347
2348 (* Done. *)
2349
2350 k, Bytes.unsafe_to_string s
2351
2352 end
2353
2354 (* Access to a string. *)
2355
2356 let read (s : string) (i : int) : int =
2357 Char.code (String.unsafe_get s i)
2358
2359 (* [get1 t i] returns the integer stored in the packed array [t] at index [i].
2360 It assumes (and does not check) that the array's bit width is [1]. The
2361 parameter [t] is just a string. *)
2362
2363 let get1 (s : string) (i : int) : int =
2364 let c = read s (i lsr 3) in
2365 let c = c lsr ((lnot i) land 0b111) in
2366 let c = c land 0b1 in
2367 c
2368
2369 (* [get t i] returns the integer stored in the packed array [t] at index [i]. *)
2370
2371 (* Together, [pack] and [get] satisfy the following property: if the index [i]
2372 is within bounds, then [get (pack a) i] equals [a.(i)]. *)
2373
2374 let get ((k, s) : t) (i : int) : int =
2375 match k with
2376 | 1 ->
2377 get1 s i
2378 | 2 ->
2379 let c = read s (i lsr 2) in
2380 let c = c lsr (2 * ((lnot i) land 0b11)) in
2381 let c = c land 0b11 in
2382 c
2383 | 4 ->
2384 let c = read s (i lsr 1) in
2385 let c = c lsr (4 * ((lnot i) land 0b1)) in
2386 let c = c land 0b1111 in
2387 c
2388 | 8 ->
2389 read s i
2390 | 16 ->
2391 let j = 2 * i in
2392 (read s j) lsl 8 + read s (j + 1)
2393 | _ ->
2394 assert (k = 32); (* 64 bits unlikely, not supported *)
2395 let j = 4 * i in
2396 (((read s j lsl 8) + read s (j + 1)) lsl 8 + read s (j + 2)) lsl 8 + read s (j + 3)
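(* An illustrative sketch of the round-trip property stated above:

     let a = [| 0; 3; 1; 2 |] in
     let p = pack a in  (* bit width 2, so all four entries fit in one character *)
     assert (get p 0 = 0 && get p 1 = 3 && get p 2 = 1 && get p 3 = 2)
 *)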
2397
2398 (* [unflatten1 (n, data) i j] accesses the two-dimensional bitmap
2399 represented by [(n, data)] at indices [i] and [j]. The integer
2400 [n] is the width of the bitmap; the string [data] is the second
2401 component of the packed array obtained by encoding the table as
2402 a one-dimensional array. *)
2403
2404 let unflatten1 (n, data) i j =
2405 get1 data (n * i + j)
2406
2407 end
2408 module RowDisplacement = struct
2409 (******************************************************************************)
2410 (* *)
2411 (* Menhir *)
2412 (* *)
2413 (* François Pottier, Inria Paris *)
2414 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2415 (* *)
2416 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2417 (* terms of the GNU Library General Public License version 2, with a *)
2418 (* special exception on linking, as described in the file LICENSE. *)
2419 (* *)
2420 (******************************************************************************)
2421
2422 (* This module compresses a two-dimensional table, where some values
2423 are considered insignificant, via row displacement. *)
2424
2425 (* This idea reportedly appears in Aho and Ullman's ``Principles
2426 of Compiler Design'' (1977). It is evaluated in Tarjan and Yao's
2427 ``Storing a Sparse Table'' (1979) and in Dencker, Dürre, and Heuft's
2428 ``Optimization of Parser Tables for Portable Compilers'' (1984). *)
2429
2430 (* A compressed table is represented as a pair of arrays. The
2431 displacement array is an array of offsets into the data array. *)
2432
2433 type 'a table =
2434 int array * (* displacement *)
2435 'a array (* data *)
2436
2437 (* In a natural version of this algorithm, displacements would be greater
2438 than (or equal to) [-n]. However, in the particular setting of Menhir,
2439 both arrays are intended to be compressed with [PackedIntArray], which
2440 does not efficiently support negative numbers. For this reason, we are
2441 careful not to produce negative displacements. *)
2442
2443 (* In order to avoid producing negative displacements, we simply use the
2444 least significant bit as the sign bit. This is implemented by [encode]
2445 and [decode] below. *)
2446
2447 (* One could also think, say, of adding [n] to every displacement, so as
2448 to ensure that all displacements are nonnegative. This would work, but
2449 would require [n] to be published, for use by the decoder. *)
2450
2451 let encode (displacement : int) : int =
2452 if displacement >= 0 then
2453 displacement lsl 1
2454 else
2455 (-displacement) lsl 1 + 1
2456
2457 let decode (displacement : int) : int =
2458 if displacement land 1 = 0 then
2459 displacement lsr 1
2460 else
2461 -(displacement lsr 1)
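(* For instance, [encode 3] is [6] and [encode (-3)] is [7]; in both cases,
   [decode] recovers the original displacement. *)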
2462
2463 (* It is reasonable to assume that, as matrices grow large, their
2464 density becomes low, i.e., they have many insignificant entries.
2465 As a result, it is important to work with a sparse data structure
2466 for rows. We internally represent a row as a list of its
2467 significant entries, where each entry is a pair of a [j] index and
2468 an element. *)
2469
2470 type 'a row =
2471 (int * 'a) list
2472
2473 (* [compress equal insignificant dummy m n t] turns the two-dimensional table
2474 [t] into a compressed table. The parameter [equal] is equality of data
2475 values. The parameter [insignificant] tells which data values are insignificant,
2476 and can thus be overwritten with other values. The parameter [dummy] is
2477 used to fill holes in the data array. [m] and [n] are the integer
2478 dimensions of the table [t]. *)
2479
2480 let compress
2481 (equal : 'a -> 'a -> bool)
2482 (insignificant : 'a -> bool)
2483 (dummy : 'a)
2484 (m : int) (n : int)
2485 (t : 'a array array)
2486 : 'a table =
2487
2488 (* Be defensive. *)
2489
2490 assert (Array.length t = m);
2491 assert begin
2492 for i = 0 to m - 1 do
2493 assert (Array.length t.(i) = n)
2494 done;
2495 true
2496 end;
2497
2498 (* This turns a row-as-array into a row-as-sparse-list. The row is
2499 accompanied by its index [i] and by its rank (the number of its
2500 significant entries, that is, the length of the row-as-a-list). *)
2501
2502 let sparse (i : int) (line : 'a array) : int * int * 'a row (* index, rank, row *) =
2503
2504 let rec loop (j : int) (rank : int) (row : 'a row) =
2505 if j < 0 then
2506 i, rank, row
2507 else
2508 let x = line.(j) in
2509 if insignificant x then
2510 loop (j - 1) rank row
2511 else
2512 loop (j - 1) (1 + rank) ((j, x) :: row)
2513 in
2514
2515 loop (n - 1) 0 []
2516
2517 in
2518
2519 (* Construct an array of all rows, together with their index and rank. *)
2520
2521 let rows : (int * int * 'a row) array = (* index, rank, row *)
2522 Array.mapi sparse t
2523 in
2524
2525 (* Sort this array by decreasing rank. This does not have any impact
2526 on correctness, but reportedly improves compression. The
2527 intuitive idea is that rows with few significant elements are
2528 easy to fit, so they should be inserted last, after the problem
2529 has become quite constrained by fitting the heavier rows. This
2530 heuristic is attributed to Ziegler. *)
2531
2532 Array.fast_sort (fun (_, rank1, _) (_, rank2, _) ->
2533 compare rank2 rank1
2534 ) rows;
2535
2536 (* Allocate a one-dimensional array of displacements. *)
2537
2538 let displacement : int array =
2539 Array.make m 0
2540 in
2541
2542 (* Allocate a one-dimensional, infinite array of values. Indices
2543 into this array are written [k]. *)
2544
2545 let data : 'a InfiniteArray.t =
2546 InfiniteArray.make dummy
2547 in
2548
2549 (* Determine whether [row] fits at offset [k] within the current [data]
2550 array, up to extension of this array. *)
2551
2552 (* Note that this check always succeeds when [k] equals the length of
2553 the [data] array. Indeed, the loop is then skipped. This property
2554 guarantees the termination of the recursive function [fit] below. *)
2555
2556 let fits k (row : 'a row) : bool =
2557
2558 let d = InfiniteArray.extent data in
2559
2560 let rec loop = function
2561 | [] ->
2562 true
2563 | (j, x) :: row ->
2564
2565 (* [x] is a significant element. *)
2566
2567 (* By hypothesis, [k + j] is nonnegative. If it is greater than or
2568 equal to the current length of the data array, stop -- the row
2569 fits. *)
2570
2571 assert (k + j >= 0);
2572
2573 if k + j >= d then
2574 true
2575
2576 (* We now know that [k + j] is within bounds of the data
2577 array. Check whether it is compatible with the element [y] found
2578 there. If it is, continue. If it isn't, stop -- the row does not
2579 fit. *)
2580
2581 else
2582 let y = InfiniteArray.get data (k + j) in
2583 if insignificant y || equal x y then
2584 loop row
2585 else
2586 false
2587
2588 in
2589 loop row
2590
2591 in
2592
2593 (* Find the leftmost position where a row fits. *)
2594
2595 (* If the leftmost significant element in this row is at offset [j],
2596 then we can hope to fit as far left as [-j] -- so this element
2597 lands at offset [0] in the data array. *)
2598
2599 (* Note that displacements may be negative. This means that, for
2600 insignificant elements, accesses to the data array could fail: they could
2601 be out of bounds, either towards the left or towards the right. This is
2602 not a problem, as long as [get] is invoked only at significant
2603 elements. *)
2604
2605 let rec fit k row : int =
2606 if fits k row then
2607 k
2608 else
2609 fit (k + 1) row
2610 in
2611
2612 let fit row =
2613 match row with
2614 | [] ->
2615 0 (* irrelevant *)
2616 | (j, _) :: _ ->
2617 fit (-j) row
2618 in
2619
2620 (* Write [row] at (compatible) offset [k]. *)
2621
2622 let rec write k = function
2623 | [] ->
2624 ()
2625 | (j, x) :: row ->
2626 InfiniteArray.set data (k + j) x;
2627 write k row
2628 in
2629
2630 (* Iterate over the sorted array of rows. Fit and write each row at
2631 the leftmost compatible offset. Update the displacement table. *)
2632
2633 Array.iter (fun (i, _, row) ->
2634 let k = fit row in (* if [row] has leading insignificant elements, then [k] can be negative *)
2635 write k row;
2636 displacement.(i) <- encode k
2637 ) rows;
2638
2639 (* Return the compressed tables. *)
2640
2641 displacement, InfiniteArray.domain data
2642
2643 (* [get ct i j] returns the value found at indices [i] and [j] in the
2644 compressed table [ct]. This function call is permitted only if the
2645 value found at indices [i] and [j] in the original table is
2646 significant -- otherwise, it could fail abruptly. *)
2647
2648 (* Together, [compress] and [get] have the property that, if the value
2649 found at indices [i] and [j] in an uncompressed table [t] is
2650 significant, then [get (compress t) i j] is equal to that value. *)
2651
2652 let get (displacement, data) i j =
2653 assert (0 <= i && i < Array.length displacement);
2654 let k = decode displacement.(i) in
2655 assert (0 <= k + j && k + j < Array.length data);
2656 (* failure of this assertion indicates an attempt to access an
2657 insignificant element that happens to be mapped out of the bounds
2658 of the [data] array. *)
2659 data.(k + j)
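(* An illustrative sketch of the property stated above, on a tiny table where
   [0] is the insignificant value:

     let t = [| [| 1; 0; 2 |];
                [| 0; 3; 0 |] |] in
     let ct = compress (=) (fun x -> x = 0) 0 2 3 t in
     assert (get ct 0 0 = 1 && get ct 0 2 = 2 && get ct 1 1 = 3)
 *)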
2660
2661 (* [getget] is a variant of [get] which only requires read access,
2662 via accessors, to the two components of the table. *)
2663
2664 let getget get_displacement get_data (displacement, data) i j =
2665 let k = decode (get_displacement displacement i) in
2666 get_data data (k + j)
2667 end
2668 module LinearizedArray = struct
2669 (******************************************************************************)
2670 (* *)
2671 (* Menhir *)
2672 (* *)
2673 (* François Pottier, Inria Paris *)
2674 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2675 (* *)
2676 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2677 (* terms of the GNU Library General Public License version 2, with a *)
2678 (* special exception on linking, as described in the file LICENSE. *)
2679 (* *)
2680 (******************************************************************************)
2681
2682 (* The [entry] array contains offsets into the [data] array. It has [n+1]
2683 elements if the original (unencoded) array has [n] elements. The value
2684 of [entry.(n)] is the length of the [data] array. This convention is
2685 natural and allows avoiding a special case. *)
2686
2687 type 'a t =
2688 (* data: *) 'a array *
2689 (* entry: *) int array
2690
2691 let make (a : 'a array array) : 'a t =
2692 let n = Array.length a in
2693 (* Build the entry array. *)
2694 let size = ref 0 in
2695 let entry = Array.init (n + 1) (fun i ->
2696 let s = !size in
2697 if i < n then
2698 size := s + Array.length a.(i);
2699 s
2700 ) in
2701 assert (entry.(n) = !size);
2702 (* Build the data array. *)
2703 let i = ref 0
2704 and j = ref 0 in
2705 let data = Array.init !size (fun _ ->
2706 while !j = Array.length a.(!i) do
2707 i := !i + 1;
2708 j := 0;
2709 done;
2710 let x = a.(!i).(!j) in
2711 j := !j + 1;
2712 x
2713 ) in
2714 data, entry
2715
2716 let length ((_, entry) : 'a t) : int =
2717 Array.length entry - 1 (* [entry] has [n + 1] elements when there are [n] rows *)
2718
2719 let row_length ((_, entry) : 'a t) i : int =
2720 entry.(i + 1) - entry.(i)
2721
2722 let row_length_via get_entry i =
2723 get_entry (i + 1) - get_entry i
2724
2725 let read ((data, entry) as la : 'a t) i j : 'a =
2726 assert (0 <= j && j < row_length la i);
2727 data.(entry.(i) + j)
2728
2729 let read_via get_data get_entry i j =
2730 assert (0 <= j && j < row_length_via get_entry i);
2731 get_data (get_entry i + j)
2732
2733 let write ((data, entry) as la : 'a t) i j (v : 'a) : unit =
2734 assert (0 <= j && j < row_length la i);
2735 data.(entry.(i) + j) <- v
2736
2737 let rec read_interval_via get_data i j =
2738 if i = j then
2739 []
2740 else
2741 get_data i :: read_interval_via get_data (i + 1) j
2742
2743 let read_row_via get_data get_entry i =
2744 read_interval_via get_data (get_entry i) (get_entry (i + 1))
2745
2746 let read_row ((data, entry) : 'a t) i : 'a list =
2747 read_row_via (Array.get data) (Array.get entry) i
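(* An illustrative sketch of the encoding:

     let la = make [| [| 1; 2 |]; [||]; [| 3 |] |] in
     (* here, data = [| 1; 2; 3 |] and entry = [| 0; 2; 2; 3 |] *)
     assert (length la = 3);
     assert (row_length la 1 = 0);
     assert (read la 0 1 = 2);
     assert (read_row la 2 = [3])
 *)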
2748
2749 end
2750 module TableFormat = struct
2751 (******************************************************************************)
2752 (* *)
2753 (* Menhir *)
2754 (* *)
2755 (* François Pottier, Inria Paris *)
2756 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2757 (* *)
2758 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2759 (* terms of the GNU Library General Public License version 2, with a *)
2760 (* special exception on linking, as described in the file LICENSE. *)
2761 (* *)
2762 (******************************************************************************)
2763
2764 (* This signature defines the format of the parse tables. It is used as
2765 an argument to [TableInterpreter.Make]. *)
2766
2767 module type TABLES = sig
2768
2769 (* This is the parser's type of tokens. *)
2770
2771 type token
2772
2773 (* This maps a token to its internal (generation-time) integer code. *)
2774
2775 val token2terminal: token -> int
2776
2777 (* This is the integer code for the error pseudo-token. *)
2778
2779 val error_terminal: int
2780
2781 (* This maps a token to its semantic value. *)
2782
2783 val token2value: token -> Obj.t
2784
2785 (* Traditionally, an LR automaton is described by two tables, namely, an
2786 action table and a goto table. See, for instance, the Dragon book.
2787
2788 The action table is a two-dimensional matrix that maps a state and a
2789 lookahead token to an action. An action is one of: shift to a certain
2790 state, reduce a certain production, accept, or fail.
2791
2792 The goto table is a two-dimensional matrix that maps a state and a
2793 non-terminal symbol to either a state or undefined. By construction, this
2794 table is sparse: its undefined entries are never looked up. A compression
2795 technique is free to overlap them with other entries.
2796
2797 In Menhir, things are slightly different. If a state has a default
2798 reduction on token [#], then that reduction must be performed without
2799 consulting the lookahead token. As a result, we must first determine
2800 whether that is the case, before we can obtain a lookahead token and use it
2801 as an index in the action table.
2802
2803 Thus, Menhir's tables are as follows.
2804
2805 A one-dimensional default reduction table maps a state to either ``no
2806 default reduction'' (encoded as: 0) or ``by default, reduce prod''
2807 (encoded as: 1 + prod). The action table is looked up only when there
2808 is no default reduction. *)
2809
2810 val default_reduction: PackedIntArray.t
2811
2812 (* Menhir follows Dencker, Dürre and Heuft, who point out that, although the
2813 action table is not sparse by nature (i.e., the error entries are
2814 significant), it can be made sparse by first factoring out a binary error
2815 matrix, then replacing the error entries in the action table with undefined
2816 entries. Thus:
2817
2818 A two-dimensional error bitmap maps a state and a terminal to either
2819 ``fail'' (encoded as: 0) or ``do not fail'' (encoded as: 1). The action
2820 table, which is now sparse, is looked up only in the latter case. *)
2821
2822 (* The error bitmap is flattened into a one-dimensional table; its width is
2823 recorded so as to allow indexing. The table is then compressed via
2824 [PackedIntArray]. The bit width of the resulting packed array must be
2825 [1], so it is not explicitly recorded. *)
2826
2827 (* The error bitmap does not contain a column for the [#] pseudo-terminal.
2828 Thus, its width is [Terminal.n - 1]. We exploit the fact that the integer
2829 code assigned to [#] is greatest: the fact that the right-most column
2830 in the bitmap is missing does not affect the code for accessing it. *)
2831
2832 val error: int (* width of the bitmap *) * string (* second component of [PackedIntArray.t] *)
2833
2834 (* A two-dimensional action table maps a state and a terminal to one of
2835 ``shift to state s and discard the current token'' (encoded as: s | 10),
2836 ``shift to state s without discarding the current token'' (encoded as: s |
2837 11), or ``reduce prod'' (encoded as: prod | 01). *)
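(* Concretely, under this encoding, the two low-order bits of an action select
   its kind, and the remaining bits carry the state or production. The actual
   decoder lives in the engine; the following is only an illustrative sketch:

     let decode_action (a : int) =
       match a land 0b11 with
       | 0b01 -> `Reduce (a lsr 2)
       | 0b10 -> `Shift_and_discard (a lsr 2)
       | 0b11 -> `Shift_without_discarding (a lsr 2)
       | _ -> assert false (* ruled out: the error bitmap is consulted first *)
 *)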
2838
2839 (* The action table is first compressed via [RowDisplacement], then packed
2840 via [PackedIntArray]. *)
2841
2842 (* Like the error bitmap, the action table does not contain a column for the
2843 [#] pseudo-terminal. *)
2844
2845 val action: PackedIntArray.t * PackedIntArray.t
2846
2847 (* A one-dimensional lhs table maps a production to its left-hand side (a
2848 non-terminal symbol). *)
2849
2850 val lhs: PackedIntArray.t
2851
2852 (* A two-dimensional goto table maps a state and a non-terminal symbol to
2853 either undefined (encoded as: 0) or a new state s (encoded as: 1 + s). *)
2854
2855 (* The goto table is first compressed via [RowDisplacement], then packed
2856 via [PackedIntArray]. *)
2857
2858 val goto: PackedIntArray.t * PackedIntArray.t
2859
2860 (* The number of start productions. A production [prod] is a start
2861 production if and only if [prod < start] holds. This is also the
2862 number of start symbols. A nonterminal symbol [nt] is a start
2863 symbol if and only if [nt < start] holds. *)
2864
2865 val start: int
2866
2867 (* A one-dimensional semantic action table maps productions to semantic
2868 actions. The calling convention for semantic actions is described in
2869 [EngineTypes]. This table contains ONLY NON-START PRODUCTIONS, so the
2870 indexing is off by [start]. Be careful. *)
2871
2872 val semantic_action: ((int, Obj.t, token) EngineTypes.env ->
2873 (int, Obj.t) EngineTypes.stack) array
2874
2875 (* The parser defines its own [Error] exception. This exception can be
2876 raised by semantic actions and caught by the engine, and raised by the
2877 engine towards the final user. *)
2878
2879 exception Error
2880
2881 (* The parser indicates whether to generate a trace. Generating a
2882 trace requires two extra tables, which respectively map a
2883 terminal symbol and a production to a string. *)
2884
2885 val trace: (string array * string array) option
2886
2887 end
2888 end
2889 module InspectionTableFormat = struct
2890 (******************************************************************************)
2891 (* *)
2892 (* Menhir *)
2893 (* *)
2894 (* François Pottier, Inria Paris *)
2895 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2896 (* *)
2897 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2898 (* terms of the GNU Library General Public License version 2, with a *)
2899 (* special exception on linking, as described in the file LICENSE. *)
2900 (* *)
2901 (******************************************************************************)
2902
2903 (* This signature defines the format of the tables that are produced (in
2904 addition to the tables described in [TableFormat]) when the command line
2905 switch [--inspection] is enabled. It is used as an argument to
2906 [InspectionTableInterpreter.Make]. *)
2907
2908 module type TABLES = sig
2909
2910 (* The types of symbols. *)
2911
2912 include IncrementalEngine.SYMBOLS
2913
2914 (* The type ['a lr1state] describes an LR(1) state. The generated parser defines
2915 it internally as [int]. *)
2916
2917 type 'a lr1state
2918
2919 (* Some of the tables that follow use encodings of (terminal and
2920 nonterminal) symbols as integers. So, we need functions that
2921 map the integer encoding of a symbol to its algebraic encoding. *)
2922
2923 val terminal: int -> xsymbol
2924 val nonterminal: int -> xsymbol
2925
2926 (* The left-hand side of every production already appears in the
2927 signature [TableFormat.TABLES], so we need not repeat it here. *)
2928
2929 (* The right-hand side of every production. This is a linearized array
2930 of arrays of integers, whose [data] and [entry] components have
2931 been packed. The encoding of symbols as integers is described in
2932 [TableBackend]. *)
2933
2934 val rhs: PackedIntArray.t * PackedIntArray.t
2935
2936 (* A mapping of every (non-initial) state to its LR(0) core. *)
2937
2938 val lr0_core: PackedIntArray.t
2939
2940 (* A mapping of every LR(0) state to its set of LR(0) items. Each item is
2941 represented in its packed form (see [Item]) as an integer. Thus the
2942 mapping is an array of arrays of integers, which is linearized and
2943 packed, like [rhs]. *)
2944
2945 val lr0_items: PackedIntArray.t * PackedIntArray.t
2946
2947 (* A mapping of every LR(0) state to its incoming symbol, if it has one. *)
2948
2949 val lr0_incoming: PackedIntArray.t
2950
2951 (* A table that tells which non-terminal symbols are nullable. *)
2952
2953 val nullable: string
2954 (* This is a packed int array of bit width 1. It can be read
2955 using [PackedIntArray.get1]. *)
2956
2957 (* A two-dimensional table, indexed by a nonterminal symbol and
2958 by a terminal symbol (other than [#]), encodes the FIRST sets. *)
2959
2960 val first: int (* width of the bitmap *) * string (* second component of [PackedIntArray.t] *)
2961
2962 end
2963
2964 end
2965 module InspectionTableInterpreter = struct
2966 (******************************************************************************)
2967 (* *)
2968 (* Menhir *)
2969 (* *)
2970 (* François Pottier, Inria Paris *)
2971 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
2972 (* *)
2973 (* Copyright Inria. All rights reserved. This file is distributed under the *)
2974 (* terms of the GNU Library General Public License version 2, with a *)
2975 (* special exception on linking, as described in the file LICENSE. *)
2976 (* *)
2977 (******************************************************************************)
2978
2979 (* -------------------------------------------------------------------------- *)
2980
2981 (* The type functor. *)
2982
2983 module Symbols (T : sig
2984
2985 type 'a terminal
2986 type 'a nonterminal
2987
2988 end) = struct
2989
2990 open T
2991
2992 (* This should be the only place in the whole library (and generator!)
2993 where these types are defined. *)
2994
2995 type 'a symbol =
2996 | T : 'a terminal -> 'a symbol
2997 | N : 'a nonterminal -> 'a symbol
2998
2999 type xsymbol =
3000 | X : 'a symbol -> xsymbol
3001
3002 end
3003
3004 (* -------------------------------------------------------------------------- *)
3005
3006 (* The code functor. *)
3007
3008 module Make
3009 (TT : TableFormat.TABLES)
3010 (IT : InspectionTableFormat.TABLES
3011 with type 'a lr1state = int)
3012 (ET : EngineTypes.TABLE
3013 with type terminal = int
3014 and type nonterminal = int
3015 and type semantic_value = Obj.t)
3016 (E : sig
3017 type 'a env = (ET.state, ET.semantic_value, ET.token) EngineTypes.env
3018 end)
3019 = struct
3020
3021 (* Including [IT] is an easy way of inheriting the definitions of the types
3022 [symbol] and [xsymbol]. *)
3023
3024 include IT
3025
3026 (* This auxiliary function decodes a packed linearized array, as created by
3027 [TableBackend.linearize_and_marshal1]. Here, we read a row all at once. *)
3028
3029 let read_packed_linearized
3030 (data, entry : PackedIntArray.t * PackedIntArray.t) (i : int) : int list
3031 =
3032 LinearizedArray.read_row_via
3033 (PackedIntArray.get data)
3034 (PackedIntArray.get entry)
3035 i
3036
3037 (* This auxiliary function decodes a symbol. The encoding was done by
3038 [encode_symbol] or [encode_symbol_option] in the table back-end. *)
3039
3040 let decode_symbol (symbol : int) : IT.xsymbol =
3041 (* If [symbol] is 0, then we have no symbol. This could mean e.g.
3042 that the function [incoming_symbol] has been applied to an
3043 initial state. In principle, this cannot happen. *)
3044 assert (symbol > 0);
3045 (* The low-order bit distinguishes terminal and nonterminal symbols. *)
3046 let kind = symbol land 1 in
3047 let symbol = symbol lsr 1 in
3048 if kind = 0 then
3049 IT.terminal (symbol - 1)
3050 else
3051 IT.nonterminal symbol

  (* These auxiliary functions convert a symbol to its integer code. For speed
     and for convenience, we use an unsafe type cast. This relies on the fact
     that the data constructors of the [terminal] and [nonterminal] GADTs are
     declared in an order that reflects their internal code. In the case of
     nonterminal symbols, we add [start] to account for the presence of the
     start symbols. *)

  let n2i (nt : 'a IT.nonterminal) : int =
    let answer = TT.start + Obj.magic nt in
    (* For safety, check that the above cast produced a correct result. *)
    assert (IT.nonterminal answer = X (N nt));
    answer

  let t2i (t : 'a IT.terminal) : int =
    let answer = Obj.magic t in
    (* For safety, check that the above cast produced a correct result. *)
    assert (IT.terminal answer = X (T t));
    answer

  (* Ordering functions. *)

  let compare_terminals t1 t2 =
    (* Subtraction is safe because overflow is impossible. *)
    t2i t1 - t2i t2

  let compare_nonterminals nt1 nt2 =
    (* Subtraction is safe because overflow is impossible. *)
    n2i nt1 - n2i nt2

  let compare_symbols symbol1 symbol2 =
    match symbol1, symbol2 with
    | X (T _), X (N _) ->
        -1
    | X (N _), X (T _) ->
        1
    | X (T t1), X (T t2) ->
        compare_terminals t1 t2
    | X (N nt1), X (N nt2) ->
        compare_nonterminals nt1 nt2

  let compare_productions prod1 prod2 =
    (* Subtraction is safe because overflow is impossible. *)
    prod1 - prod2

  let compare_items (prod1, index1) (prod2, index2) =
    let c = compare_productions prod1 prod2 in
    (* Subtraction is safe because overflow is impossible. *)
    if c <> 0 then c else index1 - index2

  (* The function [incoming_symbol] goes through the tables [IT.lr0_core] and
     [IT.lr0_incoming]. This yields a representation of type [xsymbol], out of
     which we strip the [X] quantifier, so as to get a naked symbol. This last
     step is ill-typed and potentially dangerous. It is safe only because this
     function is used at type ['a lr1state -> 'a symbol], which forces an
     appropriate choice of ['a]. *)

  let incoming_symbol (s : 'a IT.lr1state) : 'a IT.symbol =
    let core = PackedIntArray.get IT.lr0_core s in
    let symbol = decode_symbol (PackedIntArray.get IT.lr0_incoming core) in
    match symbol with
    | IT.X symbol ->
        Obj.magic symbol

  (* The function [lhs] reads the table [TT.lhs] and uses [IT.nonterminal]
     to decode the symbol. *)

  let lhs prod =
    IT.nonterminal (PackedIntArray.get TT.lhs prod)

  (* The function [rhs] reads the table [IT.rhs] and uses [decode_symbol]
     to decode the symbol. *)

  let rhs prod =
    List.map decode_symbol (read_packed_linearized IT.rhs prod)

  (* The function [items] maps the LR(1) state [s] to its LR(0) core,
     then uses [core] as an index into the table [IT.lr0_items]. The
     items are then decoded by the function [export] below, which is
     essentially a copy of [Item.export]. *)

  type item =
      int * int

  let export t : item =
    (t lsr 7, t mod 128)
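
  (* For illustration only (an editorial sketch, not part of the original
     library): the packing that [export] above undoes. Assuming an item is
     stored as [prod lsl 7 + pos], where [pos] fits in 7 bits, the inverse
     conversion would be: *)

  let _import_sketch ((prod, pos) : item) : int =
    assert (0 <= pos && pos < 128);
    (prod lsl 7) lor pos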

  let items s =
    (* Map [s] to its LR(0) core. *)
    let core = PackedIntArray.get IT.lr0_core s in
    (* Now use [core] to look up the table [IT.lr0_items]. *)
    List.map export (read_packed_linearized IT.lr0_items core)

  (* The function [nullable] maps the nonterminal symbol [nt] to its
     integer code, which it uses to look up the array [IT.nullable].
     This yields 0 or 1, which we map back to a Boolean result. *)

  let decode_bool i =
    assert (i = 0 || i = 1);
    i = 1

  let nullable nt =
    decode_bool (PackedIntArray.get1 IT.nullable (n2i nt))

  (* The function [first] maps the symbols [nt] and [t] to their integer
     codes, which it uses to look up the matrix [IT.first]. *)

  let first nt t =
    decode_bool (PackedIntArray.unflatten1 IT.first (n2i nt) (t2i t))

  let xfirst symbol t =
    match symbol with
    | X (T t') ->
        compare_terminals t t' = 0
    | X (N nt) ->
        first nt t

  (* The function [foreach_terminal] exploits the fact that the
     first component of [TT.error] is [Terminal.n - 1], i.e., the
     number of terminal symbols, including [error] but not [#]. *)

  let rec foldij i j f accu =
    if i = j then
      accu
    else
      foldij (i + 1) j f (f i accu)
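
  (* For illustration only (an editorial sketch, not part of the original
     library): [foldij i j f accu] folds over the half-open interval [i, j),
     from left to right. For instance, the following evaluates to [2; 1; 0]. *)

  let _foldij_example : int list =
    foldij 0 3 (fun i accu -> i :: accu) []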

  let foreach_terminal f accu =
    let n, _ = TT.error in
    foldij 0 n (fun i accu ->
      f (IT.terminal i) accu
    ) accu

  let foreach_terminal_but_error f accu =
    let n, _ = TT.error in
    foldij 0 n (fun i accu ->
      if i = TT.error_terminal then
        accu
      else
        f (IT.terminal i) accu
    ) accu
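
  (* For illustration only (an editorial sketch, not part of the original
     library): [foreach_terminal] behaves as a fold over all terminal
     symbols, so it can be used, e.g., to collect them in a list, in
     decreasing order of their integer codes. *)

  let _all_terminals_sketch () : IT.xsymbol list =
    foreach_terminal (fun symbol accu -> symbol :: accu) []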

  (* ------------------------------------------------------------------------ *)

  (* The following is the implementation of the function [feed]. This function
     is logically part of the LR engine, so it would be nice if it were placed
     in the module [Engine], but it must be placed here because, to ensure
     type safety, its arguments must be a symbol of type ['a symbol] and a
     semantic value of type ['a]. The type ['a symbol] is not available in
     [Engine]. It is available here. *)

  open EngineTypes
  open ET
  open E

  (* [feed] fails if the current state does not have an outgoing transition
     labeled with the desired symbol. This check is carried out at runtime. *)

  let feed_failure () =
    invalid_arg "feed: outgoing transition does not exist"

  (* Feeding a nonterminal symbol [nt]. Here, [nt] has type [nonterminal],
     which is a synonym for [int], and [semv] has type [semantic_value],
     which is a synonym for [Obj.t]. This type is unsafe, because pushing
     a semantic value of arbitrary type into the stack can later cause a
     semantic action to crash and burn. The function [feed] is given a safe
     type below. *)

  let feed_nonterminal
        (nt : nonterminal) startp (semv : semantic_value) endp (env : 'b env)
      : 'b env
  =
    (* Check if the source state has an outgoing transition labeled [nt].
       This is done by consulting the [goto] table. *)
    let source = env.current in
    match ET.maybe_goto_nt source nt with
    | None ->
        feed_failure()
    | Some target ->
        (* Push a new cell onto the stack, containing the identity of the state
           that we are leaving. The semantic value [semv] and positions [startp]
           and [endp] contained in the new cell are provided by the caller. *)
        let stack = { state = source; semv; startp; endp; next = env.stack } in
        (* Move to the target state. *)
        { env with stack; current = target }

  let reduce   _env _prod = feed_failure()
  let initiate _env       = feed_failure()

  let feed_terminal
        (terminal : terminal) startp (semv : semantic_value) endp (env : 'b env)
      : 'b env
  =
    (* Check if the source state has an outgoing transition labeled [terminal].
       This is done by consulting the [action] table. *)
    let source = env.current in
    ET.action source terminal semv
      (fun env _please_discard _terminal semv target ->
        (* There is indeed a transition toward the state [target].
           Push a new cell onto the stack and move to the target state. *)
        let stack = { state = source; semv; startp; endp; next = env.stack } in
        { env with stack; current = target }
      ) reduce initiate env

  (* The type assigned to [feed] ensures that the type of the semantic value
     [semv] is appropriate: it must be the semantic-value type of the symbol
     [symbol]. *)

  let feed (symbol : 'a symbol) startp (semv : 'a) endp env =
    let semv : semantic_value = Obj.repr semv in
    match symbol with
    | N nt ->
        feed_nonterminal (n2i nt) startp semv endp env
    | T terminal ->
        feed_terminal (t2i terminal) startp semv endp env
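
  (* Usage sketch (editorial note; [N_expr], [v], and the positions are
     hypothetical and grammar-specific): one could write

       let env = feed (N N_expr) startp v endp env

     to push that nonterminal symbol, together with the semantic value [v],
     onto the stack, provided the current state has a matching outgoing
     transition; otherwise [feed] raises [Invalid_argument] via
     [feed_failure]. *)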

end
end
module TableInterpreter = struct
(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU Library General Public License version 2, with a         *)
(*  special exception on linking, as described in the file LICENSE.           *)
(*                                                                            *)
(******************************************************************************)

module MakeEngineTable (T : TableFormat.TABLES) = struct

  type state =
      int

  let number s = s

  type token =
      T.token

  type terminal =
      int

  type nonterminal =
      int

  type semantic_value =
      Obj.t

  let token2terminal =
    T.token2terminal

  let token2value =
    T.token2value

  let error_terminal =
    T.error_terminal

  let error_value =
    Obj.repr ()

  (* The function [foreach_terminal] exploits the fact that the
     first component of [T.error] is [Terminal.n - 1], i.e., the
     number of terminal symbols, including [error] but not [#]. *)

  (* There is similar code in [InspectionTableInterpreter]. The
     code there contains an additional conversion of the type
     [terminal] to the type [xsymbol]. *)

  let rec foldij i j f accu =
    if i = j then
      accu
    else
      foldij (i + 1) j f (f i accu)

  let foreach_terminal f accu =
    let n, _ = T.error in
    foldij 0 n (fun i accu ->
      f i accu
    ) accu

  type production =
      int

  (* In principle, only non-start productions are exposed to the user,
     at type [production] or at type [int]. This is checked dynamically. *)
  let non_start_production i =
    assert (T.start <= i && i - T.start < Array.length T.semantic_action)

  let production_index i =
    non_start_production i;
    i

  let find_production i =
    non_start_production i;
    i

  let default_reduction state defred nodefred env =
    let code = PackedIntArray.get T.default_reduction state in
    if code = 0 then
      nodefred env
    else
      defred env (code - 1)

  let is_start prod =
    prod < T.start

  (* This auxiliary function helps access a compressed, two-dimensional
     matrix, like the action and goto tables. *)

  let unmarshal2 table i j =
    RowDisplacement.getget
      PackedIntArray.get
      PackedIntArray.get
      table
      i j

  let action state terminal value shift reduce fail env =
    match PackedIntArray.unflatten1 T.error state terminal with
    | 1 ->
        let action = unmarshal2 T.action state terminal in
        let opcode = action land 0b11
        and param = action lsr 2 in
        if opcode >= 0b10 then
          (* 0b10 : shift/discard *)
          (* 0b11 : shift/nodiscard *)
          let please_discard = (opcode = 0b10) in
          shift env please_discard terminal value param
        else
          (* 0b01 : reduce *)
          (* 0b00 : cannot happen *)
          reduce env param
    | c ->
        assert (c = 0);
        fail env
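
  (* Example of the decoding above (editorial note, not in the original
     source): a packed cell holding 0b1110, that is, 14, yields [opcode] =
     0b10 and [param] = 3, which is read as a shift to state 3 with
     [please_discard] set; a cell holding 0b1101, that is, 13, yields
     [opcode] = 0b01 and [param] = 3, which is read as a reduction of
     production 3. *)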

  let goto_nt state nt =
    let code = unmarshal2 T.goto state nt in
    (* code = 1 + state *)
    code - 1

  let goto_prod state prod =
    goto_nt state (PackedIntArray.get T.lhs prod)

  let maybe_goto_nt state nt =
    let code = unmarshal2 T.goto state nt in
    (* If [code] is 0, there is no outgoing transition.
       If [code] is [1 + state], there is a transition towards [state]. *)
    assert (0 <= code);
    if code = 0 then None else Some (code - 1)

  exception Error =
        T.Error

  type semantic_action =
      (state, semantic_value, token) EngineTypes.env ->
      (state, semantic_value)        EngineTypes.stack

  let semantic_action prod =
    (* Indexing into the array [T.semantic_action] is off by [T.start],
       because the start productions do not have entries in this array. *)
    T.semantic_action.(prod - T.start)
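
  (* Example (editorial note, not in the original source): if [T.start] were
     2, then productions 0 and 1 would be start productions, and the semantic
     action of production 5 would be stored at index 3 of
     [T.semantic_action]. The actual value of [T.start] is grammar-specific. *)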

  (* [may_reduce state prod] tests whether the state [state] is capable of
     reducing the production [prod]. This information could be determined
     in constant time if we were willing to create a bitmap for it, but
     that would take up a lot of space. Instead, we obtain this information
     by iterating over a line in the action table. This is costly, but this
     function is not normally used by the LR engine anyway; it is supposed
     to be used only by programmers who wish to develop error recovery
     strategies. *)

  (* In the future, if desired, we could memoize this function, so as
     to pay the cost in (memory) space only if and where this function
     is actually used. We could also replace [foreach_terminal] with a
     function [exists_terminal] which stops as soon as the accumulator
     is [true]. *)
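
  (* For illustration only (an editorial sketch, not part of the original
     library): the [exists_terminal] function suggested above could look as
     follows. It stops as soon as the predicate [f] is satisfied, instead of
     folding over every terminal symbol. *)

  let _exists_terminal_sketch (f : terminal -> bool) : bool =
    let n, _ = T.error in
    let rec loop i =
      i < n && (f i || loop (i + 1))
    in
    loop 0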

  let may_reduce state prod =
    (* Test if there is a default reduction of [prod]. *)
    default_reduction state
      (fun () prod' -> prod = prod')
      (fun () ->
        (* If not, then for each terminal [t], ... *)
        foreach_terminal (fun t accu ->
          accu ||
          (* ... test if there is a reduction of [prod] on [t]. *)
          action state t ()
            (* shift:  *) (fun () _ _ () _ -> false)
            (* reduce: *) (fun () prod' -> prod = prod')
            (* fail:   *) (fun () -> false)
            ()
        ) false
      )
      ()

  (* If [T.trace] is [None], then the logging functions do nothing. *)

  let log =
    match T.trace with Some _ -> true | None -> false

  module Log = struct

    open Printf

    let state state =
      match T.trace with
      | Some _ ->
          fprintf stderr "State %d:\n%!" state
      | None ->
          ()

    let shift terminal state =
      match T.trace with
      | Some (terminals, _) ->
          fprintf stderr "Shifting (%s) to state %d\n%!" terminals.(terminal) state
      | None ->
          ()

    let reduce_or_accept prod =
      match T.trace with
      | Some (_, productions) ->
          fprintf stderr "%s\n%!" productions.(prod)
      | None ->
          ()

    let lookahead_token token startp endp =
      match T.trace with
      | Some (terminals, _) ->
          fprintf stderr "Lookahead token is now %s (%d-%d)\n%!"
            terminals.(token)
            startp.Lexing.pos_cnum
            endp.Lexing.pos_cnum
      | None ->
          ()

    let initiating_error_handling () =
      match T.trace with
      | Some _ ->
          fprintf stderr "Initiating error handling\n%!"
      | None ->
          ()

    let resuming_error_handling () =
      match T.trace with
      | Some _ ->
          fprintf stderr "Resuming error handling\n%!"
      | None ->
          ()

    let handling_error state =
      match T.trace with
      | Some _ ->
          fprintf stderr "Handling error in state %d\n%!" state
      | None ->
          ()

  end

end
end
module StaticVersion = struct
let require_20181113 = ()
end
3518