1706 lines | 69573 chars
1 | module General : sig |
2 | (******************************************************************************) |
3 | (* *) |
4 | (* Menhir *) |
5 | (* *) |
6 | (* François Pottier, Inria Paris *) |
7 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
8 | (* *) |
9 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
10 | (* terms of the GNU Library General Public License version 2, with a *) |
11 | (* special exception on linking, as described in the file LICENSE. *) |
12 | (* *) |
13 | (******************************************************************************) |
14 | |
15 | (* This module offers general-purpose functions on lists and streams. *) |
16 | |
17 | (* As of 2017/03/31, this module is DEPRECATED. It might be removed in |
18 | the future. *) |
19 | |
20 | (* --------------------------------------------------------------------------- *) |
21 | |
22 | (* Lists. *) |
23 | |
24 | (* [take n xs] returns the [n] first elements of the list [xs]. It is |
25 | acceptable for the list [xs] to have length less than [n], in |
26 | which case [xs] itself is returned. *) |
27 | |
28 | val take: int -> 'a list -> 'a list |
29 | |
30 | (* [drop n xs] returns the list [xs], deprived of its [n] first elements. |
31 | It is acceptable for the list [xs] to have length less than [n], in |
32 | which case an empty list is returned. *) |
33 | |
34 | val drop: int -> 'a list -> 'a list |
35 | |
36 | (* [uniq cmp xs] assumes that the list [xs] is sorted according to the |
37 | ordering [cmp] and returns the list [xs] deprived of any duplicate |
38 | elements. *) |
39 | |
40 | val uniq: ('a -> 'a -> int) -> 'a list -> 'a list |
41 | |
42 | (* [weed cmp xs] returns the list [xs] deprived of any duplicate elements. *) |
43 | |
44 | val weed: ('a -> 'a -> int) -> 'a list -> 'a list |
45 | |
46 | (* --------------------------------------------------------------------------- *) |
47 | |
48 | (* A stream is a list whose elements are produced on demand. *) |
49 | |
50 | type 'a stream = |
51 | 'a head Lazy.t |
52 | |
53 | and 'a head = |
54 | | Nil |
55 | | Cons of 'a * 'a stream |
56 | |
57 | (* The length of a stream. *) |
58 | |
59 | val length: 'a stream -> int |
60 | |
61 | (* Folding over a stream. *) |
62 | |
63 | val foldr: ('a -> 'b -> 'b) -> 'a stream -> 'b -> 'b |
64 | end |
65 | module Convert : sig |
66 | (******************************************************************************) |
67 | (* *) |
68 | (* Menhir *) |
69 | (* *) |
70 | (* François Pottier, Inria Paris *) |
71 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
72 | (* *) |
73 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
74 | (* terms of the GNU Library General Public License version 2, with a *) |
75 | (* special exception on linking, as described in the file LICENSE. *) |
76 | (* *) |
77 | (******************************************************************************) |
78 | |
79 | (* An ocamlyacc-style, or Menhir-style, parser requires access to |
80 | the lexer, which must be parameterized with a lexing buffer, and |
81 | to the lexing buffer itself, where it reads position information. *) |
82 | |
83 | (* This traditional API is convenient when used with ocamllex, but |
84 | inelegant when used with other lexer generators. *) |
85 | |
86 | type ('token, 'semantic_value) traditional = |
87 | (Lexing.lexbuf -> 'token) -> Lexing.lexbuf -> 'semantic_value |
88 | |
89 | (* This revised API is independent of any lexer generator. Here, the |
90 | parser only requires access to the lexer, and the lexer takes no |
91 | parameters. The tokens returned by the lexer may contain position |
92 | information. *) |
93 | |
94 | type ('token, 'semantic_value) revised = |
95 | (unit -> 'token) -> 'semantic_value |
96 | |
97 | (* --------------------------------------------------------------------------- *) |
98 | |
99 | (* Converting a traditional parser, produced by ocamlyacc or Menhir, |
100 | into a revised parser. *) |
101 | |
102 | (* A token of the revised lexer is essentially a triple of a token |
103 | of the traditional lexer (or raw token), a start position, and |
104 | and end position. The three [get] functions are accessors. *) |
105 | |
106 | (* We do not require the type ['token] to actually be a triple type. |
107 | This enables complex applications where it is a record type with |
108 | more than three fields. It also enables simple applications where |
109 | positions are of no interest, so ['token] is just ['raw_token] |
110 | and [get_startp] and [get_endp] return dummy positions. *) |
111 | |
112 | val traditional2revised: |
113 | ('token -> 'raw_token) -> |
114 | ('token -> Lexing.position) -> |
115 | ('token -> Lexing.position) -> |
116 | ('raw_token, 'semantic_value) traditional -> |
117 | ('token, 'semantic_value) revised |
118 | |
119 | (* --------------------------------------------------------------------------- *) |
120 | |
121 | (* Converting a revised parser back to a traditional parser. *) |
122 | |
123 | val revised2traditional: |
124 | ('raw_token -> Lexing.position -> Lexing.position -> 'token) -> |
125 | ('token, 'semantic_value) revised -> |
126 | ('raw_token, 'semantic_value) traditional |
127 | |
128 | (* --------------------------------------------------------------------------- *) |
129 | |
130 | (* Simplified versions of the above, where concrete triples are used. *) |
131 | |
132 | module Simplified : sig |
133 | |
134 | val traditional2revised: |
135 | ('token, 'semantic_value) traditional -> |
136 | ('token * Lexing.position * Lexing.position, 'semantic_value) revised |
137 | |
138 | val revised2traditional: |
139 | ('token * Lexing.position * Lexing.position, 'semantic_value) revised -> |
140 | ('token, 'semantic_value) traditional |
141 | |
142 | end |
143 | end |
144 | module IncrementalEngine : sig |
145 | (******************************************************************************) |
146 | (* *) |
147 | (* Menhir *) |
148 | (* *) |
149 | (* François Pottier, Inria Paris *) |
150 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
151 | (* *) |
152 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
153 | (* terms of the GNU Library General Public License version 2, with a *) |
154 | (* special exception on linking, as described in the file LICENSE. *) |
155 | (* *) |
156 | (******************************************************************************) |
157 | |
158 | type position = Lexing.position |
159 | |
160 | open General |
161 | |
162 | (* This signature describes the incremental LR engine. *) |
163 | |
164 | (* In this mode, the user controls the lexer, and the parser suspends |
165 | itself when it needs to read a new token. *) |
166 | |
167 | module type INCREMENTAL_ENGINE = sig |
168 | |
169 | type token |
170 | |
171 | (* A value of type [production] is (an index for) a production. The start |
172 | productions (which do not exist in an \mly file, but are constructed by |
173 | Menhir internally) are not part of this type. *) |
174 | |
175 | type production |
176 | |
177 | (* The type ['a checkpoint] represents an intermediate or final state of the |
178 | parser. An intermediate checkpoint is a suspension: it records the parser's |
179 | current state, and allows parsing to be resumed. The parameter ['a] is |
180 | the type of the semantic value that will eventually be produced if the |
181 | parser succeeds. *) |
182 | |
183 | (* [Accepted] and [Rejected] are final checkpoints. [Accepted] carries a |
184 | semantic value. *) |
185 | |
186 | (* [InputNeeded] is an intermediate checkpoint. It means that the parser wishes |
187 | to read one token before continuing. *) |
188 | |
189 | (* [Shifting] is an intermediate checkpoint. It means that the parser is taking |
190 | a shift transition. It exposes the state of the parser before and after |
191 | the transition. The Boolean parameter tells whether the parser intends to |
192 | request a new token after this transition. (It always does, except when |
193 | it is about to accept.) *) |
194 | |
195 | (* [AboutToReduce] is an intermediate checkpoint. It means that the parser is |
196 | about to perform a reduction step. It exposes the parser's current |
197 | state as well as the production that is about to be reduced. *) |
198 | |
199 | (* [HandlingError] is an intermediate checkpoint. It means that the parser has |
200 | detected an error and is currently handling it, in several steps. *) |
201 | |
202 | (* A value of type ['a env] represents a configuration of the automaton: |
203 | current state, stack, lookahead token, etc. The parameter ['a] is the |
204 | type of the semantic value that will eventually be produced if the parser |
205 | succeeds. *) |
206 | |
207 | (* In normal operation, the parser works with checkpoints: see the functions |
208 | [offer] and [resume]. However, it is also possible to work directly with |
209 | environments (see the functions [pop], [force_reduction], and [feed]) and |
210 | to reconstruct a checkpoint out of an environment (see [input_needed]). |
211 | This is considered advanced functionality; its purpose is to allow error |
212 | recovery strategies to be programmed by the user. *) |
213 | |
214 | type 'a env |
215 | |
216 | type 'a checkpoint = private |
217 | | InputNeeded of 'a env |
218 | | Shifting of 'a env * 'a env * bool |
219 | | AboutToReduce of 'a env * production |
220 | | HandlingError of 'a env |
221 | | Accepted of 'a |
222 | | Rejected |
223 | |
224 | (* [offer] allows the user to resume the parser after it has suspended |
225 | itself with a checkpoint of the form [InputNeeded env]. [offer] expects the |
226 | old checkpoint as well as a new token and produces a new checkpoint. It does not |
227 | raise any exception. *) |
228 | |
229 | val offer: |
230 | 'a checkpoint -> |
231 | token * position * position -> |
232 | 'a checkpoint |
233 | |
234 | (* [resume] allows the user to resume the parser after it has suspended |
235 | itself with a checkpoint of the form [AboutToReduce (env, prod)] or |
236 | [HandlingError env]. [resume] expects the old checkpoint and produces a new |
237 | checkpoint. It does not raise any exception. *) |
238 | |
239 | val resume: |
240 | 'a checkpoint -> |
241 | 'a checkpoint |
242 | |
243 | (* A token supplier is a function of no arguments which delivers a new token |
244 | (together with its start and end positions) every time it is called. *) |
245 | |
246 | type supplier = |
247 | unit -> token * position * position |
248 | |
249 | (* A pair of a lexer and a lexing buffer can be easily turned into a supplier. *) |
250 | |
251 | val lexer_lexbuf_to_supplier: |
252 | (Lexing.lexbuf -> token) -> |
253 | Lexing.lexbuf -> |
254 | supplier |
255 | |
256 | (* The functions [offer] and [resume] are sufficient to write a parser loop. |
257 | One can imagine many variations (which is why we expose these functions |
258 | in the first place!). Here, we expose a few variations of the main loop, |
259 | ready for use. *) |
260 | |
261 | (* [loop supplier checkpoint] begins parsing from [checkpoint], reading |
262 | tokens from [supplier]. It continues parsing until it reaches a |
263 | checkpoint of the form [Accepted v] or [Rejected]. In the former case, it |
264 | returns [v]. In the latter case, it raises the exception [Error]. *) |
265 | |
266 | val loop: supplier -> 'a checkpoint -> 'a |
267 | |
268 | (* [loop_handle succeed fail supplier checkpoint] begins parsing from |
269 | [checkpoint], reading tokens from [supplier]. It continues parsing until |
270 | it reaches a checkpoint of the form [Accepted v] or [HandlingError env] |
271 | (or [Rejected], but that should not happen, as [HandlingError _] will be |
272 | observed first). In the former case, it calls [succeed v]. In the latter |
273 | case, it calls [fail] with this checkpoint. It cannot raise [Error]. |
274 | |
275 | This means that Menhir's traditional error-handling procedure (which pops |
276 | the stack until a state that can act on the [error] token is found) does |
277 | not get a chance to run. Instead, the user can implement her own error |
278 | handling code, in the [fail] continuation. *) |
279 | |
280 | val loop_handle: |
281 | ('a -> 'answer) -> |
282 | ('a checkpoint -> 'answer) -> |
283 | supplier -> 'a checkpoint -> 'answer |
284 | |
285 | (* [loop_handle_undo] is analogous to [loop_handle], except it passes a pair |
286 | of checkpoints to the failure continuation. |
287 | |
288 | The first (and oldest) checkpoint is the last [InputNeeded] checkpoint that |
289 | was encountered before the error was detected. The second (and newest) |
290 | checkpoint is where the error was detected, as in [loop_handle]. Going back |
291 | to the first checkpoint can be thought of as undoing any reductions that |
292 | were performed after seeing the problematic token. (These reductions must |
293 | be default reductions or spurious reductions.) |
294 | |
295 | [loop_handle_undo] must initially be applied to an [InputNeeded] checkpoint. |
296 | The parser's initial checkpoints satisfy this constraint. *) |
297 | |
298 | val loop_handle_undo: |
299 | ('a -> 'answer) -> |
300 | ('a checkpoint -> 'a checkpoint -> 'answer) -> |
301 | supplier -> 'a checkpoint -> 'answer |
302 | |
303 | (* [shifts checkpoint] assumes that [checkpoint] has been obtained by |
304 | submitting a token to the parser. It runs the parser from [checkpoint], |
305 | through an arbitrary number of reductions, until the parser either |
306 | accepts this token (i.e., shifts) or rejects it (i.e., signals an error). |
307 | If the parser decides to shift, then [Some env] is returned, where [env] |
308 | is the parser's state just before shifting. Otherwise, [None] is |
309 | returned. *) |
310 | |
311 | (* It is desirable that the semantic actions be side-effect free, or that |
312 | their side-effects be harmless (replayable). *) |
313 | |
314 | val shifts: 'a checkpoint -> 'a env option |
315 | |
316 | (* The function [acceptable] allows testing, after an error has been |
317 | detected, which tokens would have been accepted at this point. It is |
318 | implemented using [shifts]. Its argument should be an [InputNeeded] |
319 | checkpoint. *) |
320 | |
321 | (* For completeness, one must undo any spurious reductions before carrying out |
322 | this test -- that is, one must apply [acceptable] to the FIRST checkpoint |
323 | that is passed by [loop_handle_undo] to its failure continuation. *) |
324 | |
325 | (* This test causes some semantic actions to be run! The semantic actions |
326 | should be side-effect free, or their side-effects should be harmless. *) |
327 | |
328 | (* The position [pos] is used as the start and end positions of the |
329 | hypothetical token, and may be picked up by the semantic actions. We |
330 | suggest using the position where the error was detected. *) |
331 | |
332 | val acceptable: 'a checkpoint -> token -> position -> bool |
333 | |
334 | (* The abstract type ['a lr1state] describes the non-initial states of the |
335 | LR(1) automaton. The index ['a] represents the type of the semantic value |
336 | associated with this state's incoming symbol. *) |
337 | |
338 | type 'a lr1state |
339 | |
340 | (* The states of the LR(1) automaton are numbered (from 0 and up). *) |
341 | |
342 | val number: _ lr1state -> int |
343 | |
344 | (* Productions are numbered. *) |
345 | |
346 | (* [find_production i] requires the index [i] to be valid. Use with care. *) |
347 | |
348 | val production_index: production -> int |
349 | val find_production: int -> production |
350 | |
351 | (* An element is a pair of a non-initial state [s] and a semantic value [v] |
352 | associated with the incoming symbol of this state. The idea is, the value |
353 | [v] was pushed onto the stack just before the state [s] was entered. Thus, |
354 | for some type ['a], the state [s] has type ['a lr1state] and the value [v] |
355 | has type ['a]. In other words, the type [element] is an existential type. *) |
356 | |
357 | type element = |
358 | | Element: 'a lr1state * 'a * position * position -> element |
359 | |
360 | (* The parser's stack is (or, more precisely, can be viewed as) a stream of |
361 | elements. The type [stream] is defined by the module [General]. *) |
362 | |
363 | (* As of 2017/03/31, the types [stream] and [stack] and the function [stack] |
364 | are DEPRECATED. They might be removed in the future. An alternative way |
365 | of inspecting the stack is via the functions [top] and [pop]. *) |
366 | |
367 | type stack = (* DEPRECATED *) |
368 | element stream |
369 | |
370 | (* This is the parser's stack, a stream of elements. This stream is empty if |
371 | the parser is in an initial state; otherwise, it is non-empty. The LR(1) |
372 | automaton's current state is the one found in the top element of the |
373 | stack. *) |
374 | |
375 | val stack: 'a env -> stack (* DEPRECATED *) |
376 | |
377 | (* [top env] returns the parser's top stack element. The state contained in |
378 | this stack element is the current state of the automaton. If the stack is |
379 | empty, [None] is returned. In that case, the current state of the |
380 | automaton must be an initial state. *) |
381 | |
382 | val top: 'a env -> element option |
383 | |
384 | (* [pop_many i env] pops [i] cells off the automaton's stack. This is done |
385 | via [i] successive invocations of [pop]. Thus, [pop_many 1] is [pop]. The |
386 | index [i] must be nonnegative. The time complexity is O(i). *) |
387 | |
388 | val pop_many: int -> 'a env -> 'a env option |
389 | |
390 | (* [get i env] returns the parser's [i]-th stack element. The index [i] is |
391 | 0-based: thus, [get 0] is [top]. If [i] is greater than or equal to the |
392 | number of elements in the stack, [None] is returned. The time complexity |
393 | is O(i). *) |
394 | |
395 | val get: int -> 'a env -> element option |
396 | |
397 | (* [current_state_number env] is (the integer number of) the automaton's |
398 | current state. This works even if the automaton's stack is empty, in |
399 | which case the current state is an initial state. This number can be |
400 | passed as an argument to a [message] function generated by [menhir |
401 | --compile-errors]. *) |
402 | |
403 | val current_state_number: 'a env -> int |
404 | |
405 | (* [equal env1 env2] tells whether the parser configurations [env1] and |
406 | [env2] are equal in the sense that the automaton's current state is the |
407 | same in [env1] and [env2] and the stack is *physically* the same in |
408 | [env1] and [env2]. If [equal env1 env2] is [true], then the sequence of |
409 | the stack elements, as observed via [pop] and [top], must be the same in |
410 | [env1] and [env2]. Also, if [equal env1 env2] holds, then the checkpoints |
411 | [input_needed env1] and [input_needed env2] must be equivalent. The |
412 | function [equal] has time complexity O(1). *) |
413 | |
414 | val equal: 'a env -> 'a env -> bool |
415 | |
416 | (* These are the start and end positions of the current lookahead token. If |
417 | invoked in an initial state, this function returns a pair of twice the |
418 | initial position. *) |
419 | |
420 | val positions: 'a env -> position * position |
421 | |
422 | (* When applied to an environment taken from a checkpoint of the form |
423 | [AboutToReduce (env, prod)], the function [env_has_default_reduction] |
424 | tells whether the reduction that is about to take place is a default |
425 | reduction. *) |
426 | |
427 | val env_has_default_reduction: 'a env -> bool |
428 | |
429 | (* [state_has_default_reduction s] tells whether the state [s] has a default |
430 | reduction. This includes the case where [s] is an accepting state. *) |
431 | |
432 | val state_has_default_reduction: _ lr1state -> bool |
433 | |
434 | (* [pop env] returns a new environment, where the parser's top stack cell |
435 | has been popped off. (If the stack is empty, [None] is returned.) This |
436 | amounts to pretending that the (terminal or nonterminal) symbol that |
437 | corresponds to this stack cell has not been read. *) |
438 | |
439 | val pop: 'a env -> 'a env option |
440 | |
441 | (* [force_reduction prod env] should be called only if in the state [env] |
442 | the parser is capable of reducing the production [prod]. If this |
443 | condition is satisfied, then this production is reduced, which means that |
444 | its semantic action is executed (this can have side effects!) and the |
445 | automaton makes a goto (nonterminal) transition. If this condition is not |
446 | satisfied, [Invalid_argument _] is raised. *) |
447 | |
448 | val force_reduction: production -> 'a env -> 'a env |
449 | |
450 | (* [input_needed env] returns [InputNeeded env]. That is, out of an [env] |
451 | that might have been obtained via a series of calls to the functions |
452 | [pop], [force_reduction], [feed], etc., it produces a checkpoint, which |
453 | can be used to resume normal parsing, by supplying this checkpoint as an |
454 | argument to [offer]. *) |
455 | |
456 | (* This function should be used with some care. It could "mess up the |
457 | lookahead" in the sense that it allows parsing to resume in an arbitrary |
458 | state [s] with an arbitrary lookahead symbol [t], even though Menhir's |
459 | reachability analysis (menhir --list-errors) might well think that it is |
460 | impossible to reach this particular configuration. If one is using |
461 | Menhir's new error reporting facility, this could cause the parser to |
462 | reach an error state for which no error message has been prepared. *) |
463 | |
464 | val input_needed: 'a env -> 'a checkpoint |
465 | |
466 | end |
467 | |
468 | (* This signature is a fragment of the inspection API that is made available |
469 | to the user when [--inspection] is used. This fragment contains type |
470 | definitions for symbols. *) |
471 | |
472 | module type SYMBOLS = sig |
473 | |
474 | (* The type ['a terminal] represents a terminal symbol. The type ['a |
475 | nonterminal] represents a nonterminal symbol. In both cases, the index |
476 | ['a] represents the type of the semantic values associated with this |
477 | symbol. The concrete definitions of these types are generated. *) |
478 | |
479 | type 'a terminal |
480 | type 'a nonterminal |
481 | |
482 | (* The type ['a symbol] represents a terminal or nonterminal symbol. It is |
483 | the disjoint union of the types ['a terminal] and ['a nonterminal]. *) |
484 | |
485 | type 'a symbol = |
486 | | T : 'a terminal -> 'a symbol |
487 | | N : 'a nonterminal -> 'a symbol |
488 | |
489 | (* The type [xsymbol] is an existentially quantified version of the type |
490 | ['a symbol]. This type is useful in situations where the index ['a] |
491 | is not statically known. *) |
492 | |
493 | type xsymbol = |
494 | | X : 'a symbol -> xsymbol |
495 | |
496 | end |
497 | |
498 | (* This signature describes the inspection API that is made available to the |
499 | user when [--inspection] is used. *) |
500 | |
501 | module type INSPECTION = sig |
502 | |
503 | (* The types of symbols are described above. *) |
504 | |
505 | include SYMBOLS |
506 | |
507 | (* The type ['a lr1state] is meant to be the same as in [INCREMENTAL_ENGINE]. *) |
508 | |
509 | type 'a lr1state |
510 | |
511 | (* The type [production] is meant to be the same as in [INCREMENTAL_ENGINE]. |
512 | It represents a production of the grammar. A production can be examined |
513 | via the functions [lhs] and [rhs] below. *) |
514 | |
515 | type production |
516 | |
517 | (* An LR(0) item is a pair of a production [prod] and a valid index [i] into |
518 | this production. That is, if the length of [rhs prod] is [n], then [i] is |
519 | comprised between 0 and [n], inclusive. *) |
520 | |
521 | type item = |
522 | production * int |
523 | |
524 | (* Ordering functions. *) |
525 | |
526 | val compare_terminals: _ terminal -> _ terminal -> int |
527 | val compare_nonterminals: _ nonterminal -> _ nonterminal -> int |
528 | val compare_symbols: xsymbol -> xsymbol -> int |
529 | val compare_productions: production -> production -> int |
530 | val compare_items: item -> item -> int |
531 | |
532 | (* [incoming_symbol s] is the incoming symbol of the state [s], that is, |
533 | the symbol that the parser must recognize before (has recognized when) |
534 | it enters the state [s]. This function gives access to the semantic |
535 | value [v] stored in a stack element [Element (s, v, _, _)]. Indeed, |
536 | by case analysis on the symbol [incoming_symbol s], one discovers the |
537 | type ['a] of the value [v]. *) |
538 | |
539 | val incoming_symbol: 'a lr1state -> 'a symbol |
540 | |
541 | (* [items s] is the set of the LR(0) items in the LR(0) core of the LR(1) |
542 | state [s]. This set is not epsilon-closed. This set is presented as a |
543 | list, in an arbitrary order. *) |
544 | |
545 | val items: _ lr1state -> item list |
546 | |
547 | (* [lhs prod] is the left-hand side of the production [prod]. This is |
548 | always a non-terminal symbol. *) |
549 | |
550 | val lhs: production -> xsymbol |
551 | |
552 | (* [rhs prod] is the right-hand side of the production [prod]. This is |
553 | a (possibly empty) sequence of (terminal or nonterminal) symbols. *) |
554 | |
555 | val rhs: production -> xsymbol list |
556 | |
557 | (* [nullable nt] tells whether the non-terminal symbol [nt] is nullable. |
558 | That is, it is true if and only if this symbol produces the empty |
559 | word [epsilon]. *) |
560 | |
561 | val nullable: _ nonterminal -> bool |
562 | |
563 | (* [first nt t] tells whether the FIRST set of the nonterminal symbol [nt] |
564 | contains the terminal symbol [t]. That is, it is true if and only if |
565 | [nt] produces a word that begins with [t]. *) |
566 | |
567 | val first: _ nonterminal -> _ terminal -> bool |
568 | |
569 | (* [xfirst] is analogous to [first], but expects a first argument of type |
570 | [xsymbol] instead of [_ terminal]. *) |
571 | |
572 | val xfirst: xsymbol -> _ terminal -> bool |
573 | |
574 | (* [foreach_terminal] enumerates the terminal symbols, including [error]. |
575 | [foreach_terminal_but_error] enumerates the terminal symbols, excluding |
576 | [error]. *) |
577 | |
578 | val foreach_terminal: (xsymbol -> 'a -> 'a) -> 'a -> 'a |
579 | val foreach_terminal_but_error: (xsymbol -> 'a -> 'a) -> 'a -> 'a |
580 | |
581 | (* The type [env] is meant to be the same as in [INCREMENTAL_ENGINE]. *) |
582 | |
583 | type 'a env |
584 | |
585 | (* [feed symbol startp semv endp env] causes the parser to consume the |
586 | (terminal or nonterminal) symbol [symbol], accompanied with the semantic |
587 | value [semv] and with the start and end positions [startp] and [endp]. |
588 | Thus, the automaton makes a transition, and reaches a new state. The |
589 | stack grows by one cell. This operation is permitted only if the current |
590 | state (as determined by [env]) has an outgoing transition labeled with |
591 | [symbol]. Otherwise, [Invalid_argument _] is raised. *) |
592 | |
593 | val feed: 'a symbol -> position -> 'a -> position -> 'b env -> 'b env |
594 | |
595 | end |
596 | |
597 | (* This signature combines the incremental API and the inspection API. *) |
598 | |
599 | module type EVERYTHING = sig |
600 | |
601 | include INCREMENTAL_ENGINE |
602 | |
603 | include INSPECTION |
604 | with type 'a lr1state := 'a lr1state |
605 | with type production := production |
606 | with type 'a env := 'a env |
607 | |
608 | end |
609 | end |
610 | module EngineTypes : sig |
611 | (******************************************************************************) |
612 | (* *) |
613 | (* Menhir *) |
614 | (* *) |
615 | (* François Pottier, Inria Paris *) |
616 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
617 | (* *) |
618 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
619 | (* terms of the GNU Library General Public License version 2, with a *) |
620 | (* special exception on linking, as described in the file LICENSE. *) |
621 | (* *) |
622 | (******************************************************************************) |
623 | |
624 | (* This file defines several types and module types that are used in the |
625 | specification of module [Engine]. *) |
626 | |
627 | (* --------------------------------------------------------------------------- *) |
628 | |
629 | (* It would be nice if we could keep the structure of stacks and environments |
630 | hidden. However, stacks and environments must be accessible to semantic |
631 | actions, so the following data structure definitions must be public. *) |
632 | |
633 | (* --------------------------------------------------------------------------- *) |
634 | |
635 | (* A stack is a linked list of cells. A sentinel cell -- which is its own |
636 | successor -- is used to mark the bottom of the stack. The sentinel cell |
637 | itself is not significant -- it contains dummy values. *) |
638 | |
639 | type ('state, 'semantic_value) stack = { |
640 | |
641 | (* The state that we should go back to if we pop this stack cell. *) |
642 | |
643 | (* This convention means that the state contained in the top stack cell is |
644 | not the current state [env.current]. It also means that the state found |
645 | within the sentinel is a dummy -- it is never consulted. This convention |
646 | is the same as that adopted by the code-based back-end. *) |
647 | |
648 | state: 'state; |
649 | |
650 | (* The semantic value associated with the chunk of input that this cell |
651 | represents. *) |
652 | |
653 | semv: 'semantic_value; |
654 | |
655 | (* The start and end positions of the chunk of input that this cell |
656 | represents. *) |
657 | |
658 | startp: Lexing.position; |
659 | endp: Lexing.position; |
660 | |
661 | (* The next cell down in the stack. If this is a self-pointer, then this |
662 | cell is the sentinel, and the stack is conceptually empty. *) |
663 | |
664 | next: ('state, 'semantic_value) stack; |
665 | |
666 | } |
667 | |
668 | (* --------------------------------------------------------------------------- *) |
669 | |
670 | (* A parsing environment contains all of the parser's state (except for the |
671 | current program point). *) |
672 | |
673 | type ('state, 'semantic_value, 'token) env = { |
674 | |
675 | (* If this flag is true, then the first component of [env.triple] should |
676 | be ignored, as it has been logically overwritten with the [error] |
677 | pseudo-token. *) |
678 | |
679 | error: bool; |
680 | |
681 | (* The last token that was obtained from the lexer, together with its start |
682 | and end positions. Warning: before the first call to the lexer has taken |
683 | place, a dummy (and possibly invalid) token is stored here. *) |
684 | |
685 | triple: 'token * Lexing.position * Lexing.position; |
686 | |
687 | (* The stack. In [CodeBackend], it is passed around on its own, |
688 | whereas, here, it is accessed via the environment. *) |
689 | |
690 | stack: ('state, 'semantic_value) stack; |
691 | |
692 | (* The current state. In [CodeBackend], it is passed around on its |
693 | own, whereas, here, it is accessed via the environment. *) |
694 | |
695 | current: 'state; |
696 | |
697 | } |
698 | |
699 | (* --------------------------------------------------------------------------- *) |
700 | |
701 | (* This signature describes the parameters that must be supplied to the LR |
702 | engine. *) |
703 | |
704 | module type TABLE = sig |
705 | |
706 | (* The type of automaton states. *) |
707 | |
708 | type state |
709 | |
710 | (* States are numbered. *) |
711 | |
712 | val number: state -> int |
713 | |
714 | (* The type of tokens. These can be thought of as real tokens, that is, |
715 | tokens returned by the lexer. They carry a semantic value. This type |
716 | does not include the [error] pseudo-token. *) |
717 | |
718 | type token |
719 | |
720 | (* The type of terminal symbols. These can be thought of as integer codes. |
721 | They do not carry a semantic value. This type does include the [error] |
722 | pseudo-token. *) |
723 | |
724 | type terminal |
725 | |
726 | (* The type of nonterminal symbols. *) |
727 | |
728 | type nonterminal |
729 | |
730 | (* The type of semantic values. *) |
731 | |
732 | type semantic_value |
733 | |
734 | (* A token is conceptually a pair of a (non-[error]) terminal symbol and |
735 | a semantic value. The following two functions are the pair projections. *) |
736 | |
737 | val token2terminal: token -> terminal |
738 | val token2value: token -> semantic_value |
739 | |
740 | (* Even though the [error] pseudo-token is not a real token, it is a |
741 | terminal symbol. Furthermore, for regularity, it must have a semantic |
742 | value. *) |
743 | |
744 | val error_terminal: terminal |
745 | val error_value: semantic_value |
746 | |
747 | (* [foreach_terminal] allows iterating over all terminal symbols. *) |
748 | |
749 | val foreach_terminal: (terminal -> 'a -> 'a) -> 'a -> 'a |
750 | |
751 | (* The type of productions. *) |
752 | |
753 | type production |
754 | |
755 | val production_index: production -> int |
756 | val find_production: int -> production |
757 | |
758 | (* If a state [s] has a default reduction on production [prod], then, upon |
759 | entering [s], the automaton should reduce [prod] without consulting the |
760 | lookahead token. The following function allows determining which states |
761 | have default reductions. *) |
762 | |
763 | (* Instead of returning a value of a sum type -- either [DefRed prod], or |
764 | [NoDefRed] -- it accepts two continuations, and invokes just one of |
765 | them. This mechanism allows avoiding a memory allocation. *) |
766 | |
767 | val default_reduction: |
768 | state -> |
769 | ('env -> production -> 'answer) -> |
770 | ('env -> 'answer) -> |
771 | 'env -> 'answer |
772 | |
773 | (* An LR automaton can normally take three kinds of actions: shift, reduce, |
774 | or fail. (Acceptance is a particular case of reduction: it consists in |
775 | reducing a start production.) *) |
776 | |
777 | (* There are two variants of the shift action. [shift/discard s] instructs |
778 | the automaton to discard the current token, request a new one from the |
779 | lexer, and move to state [s]. [shift/nodiscard s] instructs it to move to |
780 | state [s] without requesting a new token. This instruction should be used |
781 | when [s] has a default reduction on [#]. See [CodeBackend.gettoken] for |
782 | details. *) |
783 | |
784 | (* This is the automaton's action table. It maps a pair of a state and a |
785 | terminal symbol to an action. *) |
786 | |
787 | (* Instead of returning a value of a sum type -- one of shift/discard, |
788 | shift/nodiscard, reduce, or fail -- this function accepts three |
789 | continuations, and invokes just one them. This mechanism allows avoiding |
790 | a memory allocation. *) |
791 | |
792 | (* In summary, the parameters to [action] are as follows: |
793 | |
794 | - the first two parameters, a state and a terminal symbol, are used to |
795 | look up the action table; |
796 | |
797 | - the next parameter is the semantic value associated with the above |
798 | terminal symbol; it is not used, only passed along to the shift |
799 | continuation, as explained below; |
800 | |
801 | - the shift continuation expects an environment; a flag that tells |
802 | whether to discard the current token; the terminal symbol that |
803 | is being shifted; its semantic value; and the target state of |
804 | the transition; |
805 | |
806 | - the reduce continuation expects an environment and a production; |
807 | |
808 | - the fail continuation expects an environment; |
809 | |
810 | - the last parameter is the environment; it is not used, only passed |
811 | along to the selected continuation. *) |
812 | |
813 | val action: |
814 | state -> |
815 | terminal -> |
816 | semantic_value -> |
817 | ('env -> bool -> terminal -> semantic_value -> state -> 'answer) -> |
818 | ('env -> production -> 'answer) -> |
819 | ('env -> 'answer) -> |
820 | 'env -> 'answer |
821 | |
822 | (* This is the automaton's goto table. This table maps a pair of a state |
823 | and a nonterminal symbol to a new state. By extension, it also maps a |
824 | pair of a state and a production to a new state. *) |
825 | |
826 | (* The function [goto_nt] can be applied to [s] and [nt] ONLY if the state |
827 | [s] has an outgoing transition labeled [nt]. Otherwise, its result is |
828 | undefined. Similarly, the call [goto_prod prod s] is permitted ONLY if |
829 | the state [s] has an outgoing transition labeled with the nonterminal |
830 | symbol [lhs prod]. The function [maybe_goto_nt] involves an additional |
831 | dynamic check and CAN be called even if there is no outgoing transition. *) |
832 | |
833 | val goto_nt : state -> nonterminal -> state |
834 | val goto_prod: state -> production -> state |
835 | val maybe_goto_nt: state -> nonterminal -> state option |
836 | |
837 | (* [is_start prod] tells whether the production [prod] is a start production. *) |
838 | |
839 | val is_start: production -> bool |
840 | |
841 | (* By convention, a semantic action is responsible for: |
842 | |
843 | 1. fetching whatever semantic values and positions it needs off the stack; |
844 | |
845 | 2. popping an appropriate number of cells off the stack, as dictated |
846 | by the length of the right-hand side of the production; |
847 | |
848 | 3. computing a new semantic value, as well as new start and end positions; |
849 | |
850 | 4. pushing a new stack cell, which contains the three values |
851 | computed in step 3; |
852 | |
853 | 5. returning the new stack computed in steps 2 and 4. |
854 | |
855 | Point 1 is essentially forced upon us: if semantic values were fetched |
856 | off the stack by this interpreter, then the calling convention for |
857 | semantic actions would be variadic: not all semantic actions would have |
858 | the same number of arguments. The rest follows rather naturally. *) |
859 | |
860 | (* Semantic actions are allowed to raise [Error]. *) |
861 | |
862 | exception Error |
863 | |
864 | type semantic_action = |
865 | (state, semantic_value, token) env -> (state, semantic_value) stack |
866 | |
867 | val semantic_action: production -> semantic_action |
868 | |
869 | (* [may_reduce state prod] tests whether the state [state] is capable of |
870 | reducing the production [prod]. This function is currently costly and |
871 | is not used by the core LR engine. It is used in the implementation |
872 | of certain functions, such as [force_reduction], which allow the engine |
873 | to be driven programmatically. *) |
874 | |
875 | val may_reduce: state -> production -> bool |
876 | |
877 | (* The LR engine requires a number of hooks, which are used for logging. *) |
878 | |
879 | (* The comments below indicate the conventional messages that correspond |
880 | to these hooks in the code-based back-end; see [CodeBackend]. *) |
881 | |
882 | (* If the flag [log] is false, then the logging functions are not called. |
883 | If it is [true], then they are called. *) |
884 | |
885 | val log : bool |
886 | |
887 | module Log : sig |
888 | |
889 | (* State %d: *) |
890 | |
891 | val state: state -> unit |
892 | |
893 | (* Shifting (<terminal>) to state <state> *) |
894 | |
895 | val shift: terminal -> state -> unit |
896 | |
897 | (* Reducing a production should be logged either as a reduction |
898 | event (for regular productions) or as an acceptance event (for |
899 | start productions). *) |
900 | |
901 | (* Reducing production <production> / Accepting *) |
902 | |
903 | val reduce_or_accept: production -> unit |
904 | |
905 | (* Lookahead token is now <terminal> (<pos>-<pos>) *) |
906 | |
907 | val lookahead_token: terminal -> Lexing.position -> Lexing.position -> unit |
908 | |
909 | (* Initiating error handling *) |
910 | |
911 | val initiating_error_handling: unit -> unit |
912 | |
913 | (* Resuming error handling *) |
914 | |
915 | val resuming_error_handling: unit -> unit |
916 | |
917 | (* Handling error in state <state> *) |
918 | |
919 | val handling_error: state -> unit |
920 | |
921 | end |
922 | |
923 | end |
924 | |
925 | (* --------------------------------------------------------------------------- *) |
926 | |
927 | (* This signature describes the monolithic (traditional) LR engine. *) |
928 | |
929 | (* In this interface, the parser controls the lexer. *) |
930 | |
931 | module type MONOLITHIC_ENGINE = sig |
932 | |
933 | type state |
934 | |
935 | type token |
936 | |
937 | type semantic_value |
938 | |
939 | (* An entry point to the engine requires a start state, a lexer, and a lexing |
940 | buffer. It either succeeds and produces a semantic value, or fails and |
941 | raises [Error]. *) |
942 | |
943 | exception Error |
944 | |
945 | val entry: |
946 | state -> |
947 | (Lexing.lexbuf -> token) -> |
948 | Lexing.lexbuf -> |
949 | semantic_value |
950 | |
951 | end |
952 | |
953 | (* --------------------------------------------------------------------------- *) |
954 | |
955 | (* The following signatures describe the incremental LR engine. *) |
956 | |
957 | (* First, see [INCREMENTAL_ENGINE] in the file [IncrementalEngine.ml]. *) |
958 | |
959 | (* The [start] function is set apart because we do not wish to publish |
960 | it as part of the generated [parser.mli] file. Instead, the table |
961 | back-end will publish specialized versions of it, with a suitable |
962 | type cast. *) |
963 | |
964 | module type INCREMENTAL_ENGINE_START = sig |
965 | |
966 | (* [start] is an entry point. It requires a start state and a start position |
967 | and begins the parsing process. If the lexer is based on an OCaml lexing |
968 | buffer, the start position should be [lexbuf.lex_curr_p]. [start] produces |
969 | a checkpoint, which usually will be an [InputNeeded] checkpoint. (It could |
970 | be [Accepted] if this starting state accepts only the empty word. It could |
971 | be [Rejected] if this starting state accepts no word at all.) It does not |
972 | raise any exception. *) |
973 | |
974 | (* [start s pos] should really produce a checkpoint of type ['a checkpoint], |
975 | for a fixed ['a] that depends on the state [s]. We cannot express this, so |
976 | we use [semantic_value checkpoint], which is safe. The table back-end uses |
977 | [Obj.magic] to produce safe specialized versions of [start]. *) |
978 | |
979 | type state |
980 | type semantic_value |
981 | type 'a checkpoint |
982 | |
983 | val start: |
984 | state -> |
985 | Lexing.position -> |
986 | semantic_value checkpoint |
987 | |
988 | end |
989 | |
990 | (* --------------------------------------------------------------------------- *) |
991 | |
992 | (* This signature describes the LR engine, which combines the monolithic |
993 | and incremental interfaces. *) |
994 | |
995 | module type ENGINE = sig |
996 | |
997 | include MONOLITHIC_ENGINE |
998 | |
999 | include IncrementalEngine.INCREMENTAL_ENGINE |
1000 | with type token := token |
1001 | and type 'a lr1state = state (* useful for us; hidden from the end user *) |
1002 | |
1003 | include INCREMENTAL_ENGINE_START |
1004 | with type state := state |
1005 | and type semantic_value := semantic_value |
1006 | and type 'a checkpoint := 'a checkpoint |
1007 | |
1008 | end |
1009 | end |
1010 | module Engine : sig |
1011 | (******************************************************************************) |
1012 | (* *) |
1013 | (* Menhir *) |
1014 | (* *) |
1015 | (* François Pottier, Inria Paris *) |
1016 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1017 | (* *) |
1018 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1019 | (* terms of the GNU Library General Public License version 2, with a *) |
1020 | (* special exception on linking, as described in the file LICENSE. *) |
1021 | (* *) |
1022 | (******************************************************************************) |
1023 | |
1024 | open EngineTypes |
1025 | |
1026 | (* The LR parsing engine. *) |
1027 | |
1028 | module Make (T : TABLE) |
1029 | : ENGINE |
1030 | with type state = T.state |
1031 | and type token = T.token |
1032 | and type semantic_value = T.semantic_value |
1033 | and type production = T.production |
1034 | and type 'a env = (T.state, T.semantic_value, T.token) EngineTypes.env |
1035 | |
1036 | (* We would prefer not to expose the definition of the type [env]. |
1037 | However, it must be exposed because some of the code in the |
1038 | inspection API needs access to the engine's internals; see |
1039 | [InspectionTableInterpreter]. Everything would be simpler if |
1040 | --inspection was always ON, but that would lead to bigger parse |
1041 | tables for everybody. *) |
1042 | end |
1043 | module ErrorReports : sig |
1044 | (******************************************************************************) |
1045 | (* *) |
1046 | (* Menhir *) |
1047 | (* *) |
1048 | (* François Pottier, Inria Paris *) |
1049 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1050 | (* *) |
1051 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1052 | (* terms of the GNU Library General Public License version 2, with a *) |
1053 | (* special exception on linking, as described in the file LICENSE. *) |
1054 | (* *) |
1055 | (******************************************************************************) |
1056 | |
1057 | (* -------------------------------------------------------------------------- *) |
1058 | |
1059 | (* The following functions help keep track of the start and end positions of |
1060 | the last two tokens in a two-place buffer. This is used to nicely display |
1061 | where a syntax error took place. *) |
1062 | |
1063 | type 'a buffer |
1064 | |
1065 | (* [wrap lexer] returns a pair of a new (initially empty) buffer and a lexer |
1066 | which internally relies on [lexer] and updates [buffer] on the fly whenever |
1067 | a token is demanded. *) |
1068 | |
1069 | open Lexing |
1070 | |
1071 | val wrap: |
1072 | (lexbuf -> 'token) -> |
1073 | (position * position) buffer * (lexbuf -> 'token) |
1074 | |
1075 | (* [show f buffer] prints the contents of the buffer, producing a string that |
1076 | is typically of the form "after '%s' and before '%s'". The function [f] is |
1077 | used to print an element. The buffer MUST be nonempty. *) |
1078 | |
1079 | val show: ('a -> string) -> 'a buffer -> string |
1080 | |
1081 | (* [last buffer] returns the last element of the buffer. The buffer MUST be |
1082 | nonempty. *) |
1083 | |
1084 | val last: 'a buffer -> 'a |
1085 | |
1086 | (* -------------------------------------------------------------------------- *) |
1087 | end |
1088 | module Printers : sig |
1089 | (******************************************************************************) |
1090 | (* *) |
1091 | (* Menhir *) |
1092 | (* *) |
1093 | (* François Pottier, Inria Paris *) |
1094 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1095 | (* *) |
1096 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1097 | (* terms of the GNU Library General Public License version 2, with a *) |
1098 | (* special exception on linking, as described in the file LICENSE. *) |
1099 | (* *) |
1100 | (******************************************************************************) |
1101 | |
1102 | (* This module is part of MenhirLib. *) |
1103 | |
1104 | module Make |
1105 | |
1106 | (I : IncrementalEngine.EVERYTHING) |
1107 | |
1108 | (User : sig |
1109 | |
1110 | (* [print s] is supposed to send the string [s] to some output channel. *) |
1111 | |
1112 | val print: string -> unit |
1113 | |
1114 | (* [print_symbol s] is supposed to print a representation of the symbol [s]. *) |
1115 | |
1116 | val print_symbol: I.xsymbol -> unit |
1117 | |
1118 | (* [print_element e] is supposed to print a representation of the element [e]. |
1119 | This function is optional; if it is not provided, [print_element_as_symbol] |
1120 | (defined below) is used instead. *) |
1121 | |
1122 | val print_element: (I.element -> unit) option |
1123 | |
1124 | end) |
1125 | |
1126 | : sig |
1127 | |
1128 | open I |
1129 | |
1130 | (* Printing a list of symbols. *) |
1131 | |
1132 | val print_symbols: xsymbol list -> unit |
1133 | |
1134 | (* Printing an element as a symbol. This prints just the symbol |
1135 | that this element represents; nothing more. *) |
1136 | |
1137 | val print_element_as_symbol: element -> unit |
1138 | |
1139 | (* Printing a stack as a list of elements. This function needs an element |
1140 | printer. It uses [print_element] if provided by the user; otherwise |
1141 | it uses [print_element_as_symbol]. (Ending with a newline.) *) |
1142 | |
1143 | val print_stack: 'a env -> unit |
1144 | |
1145 | (* Printing an item. (Ending with a newline.) *) |
1146 | |
1147 | val print_item: item -> unit |
1148 | |
1149 | (* Printing a production. (Ending with a newline.) *) |
1150 | |
1151 | val print_production: production -> unit |
1152 | |
1153 | (* Printing the current LR(1) state. The current state is first displayed |
1154 | as a number; then the list of its LR(0) items is printed. (Ending with |
1155 | a newline.) *) |
1156 | |
1157 | val print_current_state: 'a env -> unit |
1158 | |
1159 | (* Printing a summary of the stack and current state. This function just |
1160 | calls [print_stack] and [print_current_state] in succession. *) |
1161 | |
1162 | val print_env: 'a env -> unit |
1163 | |
1164 | end |
1165 | end |
1166 | module InfiniteArray : sig |
1167 | (******************************************************************************) |
1168 | (* *) |
1169 | (* Menhir *) |
1170 | (* *) |
1171 | (* François Pottier, Inria Paris *) |
1172 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1173 | (* *) |
1174 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1175 | (* terms of the GNU Library General Public License version 2, with a *) |
1176 | (* special exception on linking, as described in the file LICENSE. *) |
1177 | (* *) |
1178 | (******************************************************************************) |
1179 | |
1180 | (** This module implements infinite arrays. **) |
1181 | type 'a t |
1182 | |
1183 | (** [make x] creates an infinite array, where every slot contains [x]. **) |
1184 | val make: 'a -> 'a t |
1185 | |
1186 | (** [get a i] returns the element contained at offset [i] in the array [a]. |
1187 | Slots are numbered 0 and up. **) |
1188 | val get: 'a t -> int -> 'a |
1189 | |
1190 | (** [set a i x] sets the element contained at offset [i] in the array |
1191 | [a] to [x]. Slots are numbered 0 and up. **) |
1192 | val set: 'a t -> int -> 'a -> unit |
1193 | |
1194 | (** [extent a] is the length of an initial segment of the array [a] |
1195 | that is sufficiently large to contain all [set] operations ever |
1196 | performed. In other words, all elements beyond that segment have |
1197 | the default value. *) |
1198 | val extent: 'a t -> int |
1199 | |
1200 | (** [domain a] is a fresh copy of an initial segment of the array [a] |
1201 | whose length is [extent a]. *) |
1202 | val domain: 'a t -> 'a array |
1203 | end |
1204 | module PackedIntArray : sig |
1205 | (******************************************************************************) |
1206 | (* *) |
1207 | (* Menhir *) |
1208 | (* *) |
1209 | (* François Pottier, Inria Paris *) |
1210 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1211 | (* *) |
1212 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1213 | (* terms of the GNU Library General Public License version 2, with a *) |
1214 | (* special exception on linking, as described in the file LICENSE. *) |
1215 | (* *) |
1216 | (******************************************************************************) |
1217 | |
1218 | (* A packed integer array is represented as a pair of an integer [k] and |
1219 | a string [s]. The integer [k] is the number of bits per integer that we |
1220 | use. The string [s] is just an array of bits, which is read in 8-bit |
1221 | chunks. *) |
1222 | |
1223 | (* The ocaml programming language treats string literals and array literals |
1224 | in slightly different ways: the former are statically allocated, while |
1225 | the latter are dynamically allocated. (This is rather arbitrary.) In the |
1226 | context of Menhir's table-based back-end, where compact, immutable |
1227 | integer arrays are needed, ocaml strings are preferable to ocaml arrays. *) |
1228 | |
1229 | type t = |
1230 | int * string |
1231 | |
1232 | (* [pack a] turns an array of integers into a packed integer array. *) |
1233 | |
1234 | (* Because the sign bit is the most significant bit, the magnitude of |
1235 | any negative number is the word size. In other words, [pack] does |
1236 | not achieve any space savings as soon as [a] contains any negative |
1237 | numbers, even if they are ``small''. *) |
1238 | |
1239 | val pack: int array -> t |
1240 | |
1241 | (* [get t i] returns the integer stored in the packed array [t] at index [i]. *) |
1242 | |
1243 | (* Together, [pack] and [get] satisfy the following property: if the index [i] |
1244 | is within bounds, then [get (pack a) i] equals [a.(i)]. *) |
1245 | |
1246 | val get: t -> int -> int |
1247 | |
1248 | (* [get1 t i] returns the integer stored in the packed array [t] at index [i]. |
1249 | It assumes (and does not check) that the array's bit width is [1]. The |
1250 | parameter [t] is just a string. *) |
1251 | |
1252 | val get1: string -> int -> int |
1253 | |
1254 | (* [unflatten1 (n, data) i j] accesses the two-dimensional bitmap |
1255 | represented by [(n, data)] at indices [i] and [j]. The integer |
1256 | [n] is the width of the bitmap; the string [data] is the second |
1257 | component of the packed array obtained by encoding the table as |
1258 | a one-dimensional array. *) |
1259 | |
1260 | val unflatten1: int * string -> int -> int -> int |
1261 | |
1262 | end |
1263 | module RowDisplacement : sig |
1264 | (******************************************************************************) |
1265 | (* *) |
1266 | (* Menhir *) |
1267 | (* *) |
1268 | (* François Pottier, Inria Paris *) |
1269 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1270 | (* *) |
1271 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1272 | (* terms of the GNU Library General Public License version 2, with a *) |
1273 | (* special exception on linking, as described in the file LICENSE. *) |
1274 | (* *) |
1275 | (******************************************************************************) |
1276 | |
1277 | (* This module compresses a two-dimensional table, where some values |
1278 | are considered insignificant, via row displacement. *) |
1279 | |
1280 | (* A compressed table is represented as a pair of arrays. The |
1281 | displacement array is an array of offsets into the data array. *) |
1282 | |
1283 | type 'a table = |
1284 | int array * (* displacement *) |
1285 | 'a array (* data *) |
1286 | |
1287 | (* [compress equal insignificant dummy m n t] turns the two-dimensional table |
1288 | [t] into a compressed table. The parameter [equal] is equality of data |
1289 | values. The parameter [wildcard] tells which data values are insignificant, |
1290 | and can thus be overwritten with other values. The parameter [dummy] is |
1291 | used to fill holes in the data array. [m] and [n] are the integer |
1292 | dimensions of the table [t]. *) |
1293 | |
1294 | val compress: |
1295 | ('a -> 'a -> bool) -> |
1296 | ('a -> bool) -> |
1297 | 'a -> |
1298 | int -> int -> |
1299 | 'a array array -> |
1300 | 'a table |
1301 | |
1302 | (* [get ct i j] returns the value found at indices [i] and [j] in the |
1303 | compressed table [ct]. This function call is permitted only if the |
1304 | value found at indices [i] and [j] in the original table is |
1305 | significant -- otherwise, it could fail abruptly. *) |
1306 | |
1307 | (* Together, [compress] and [get] have the property that, if the value |
1308 | found at indices [i] and [j] in an uncompressed table [t] is |
1309 | significant, then [get (compress t) i j] is equal to that value. *) |
1310 | |
1311 | val get: |
1312 | 'a table -> |
1313 | int -> int -> |
1314 | 'a |
1315 | |
1316 | (* [getget] is a variant of [get] which only requires read access, |
1317 | via accessors, to the two components of the table. *) |
1318 | |
1319 | val getget: |
1320 | ('displacement -> int -> int) -> |
1321 | ('data -> int -> 'a) -> |
1322 | 'displacement * 'data -> |
1323 | int -> int -> |
1324 | 'a |
1325 | |
1326 | end |
1327 | module LinearizedArray : sig |
1328 | (******************************************************************************) |
1329 | (* *) |
1330 | (* Menhir *) |
1331 | (* *) |
1332 | (* François Pottier, Inria Paris *) |
1333 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1334 | (* *) |
1335 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1336 | (* terms of the GNU Library General Public License version 2, with a *) |
1337 | (* special exception on linking, as described in the file LICENSE. *) |
1338 | (* *) |
1339 | (******************************************************************************) |
1340 | |
1341 | (* An array of arrays (of possibly different lengths!) can be ``linearized'', |
1342 | i.e., encoded as a data array (by concatenating all of the little arrays) |
1343 | and an entry array (which contains offsets into the data array). *) |
1344 | |
1345 | type 'a t = |
1346 | (* data: *) 'a array * |
1347 | (* entry: *) int array |
1348 | |
1349 | (* [make a] turns the array of arrays [a] into a linearized array. *) |
1350 | |
1351 | val make: 'a array array -> 'a t |
1352 | |
1353 | (* [read la i j] reads the linearized array [la] at indices [i] and [j]. |
1354 | Thus, [read (make a) i j] is equivalent to [a.(i).(j)]. *) |
1355 | |
1356 | val read: 'a t -> int -> int -> 'a |
1357 | |
1358 | (* [write la i j v] writes the value [v] into the linearized array [la] |
1359 | at indices [i] and [j]. *) |
1360 | |
1361 | val write: 'a t -> int -> int -> 'a -> unit |
1362 | |
1363 | (* [length la] is the number of rows of the array [la]. Thus, [length (make |
1364 | a)] is equivalent to [Array.length a]. *) |
1365 | |
1366 | val length: 'a t -> int |
1367 | |
1368 | (* [row_length la i] is the length of the row at index [i] in the linearized |
1369 | array [la]. Thus, [row_length (make a) i] is equivalent to [Array.length |
1370 | a.(i)]. *) |
1371 | |
1372 | val row_length: 'a t -> int -> int |
1373 | |
1374 | (* [read_row la i] reads the row at index [i], producing a list. Thus, |
1375 | [read_row (make a) i] is equivalent to [Array.to_list a.(i)]. *) |
1376 | |
1377 | val read_row: 'a t -> int -> 'a list |
1378 | |
1379 | (* The following variants read the linearized array via accessors |
1380 | [get_data : int -> 'a] and [get_entry : int -> int]. *) |
1381 | |
1382 | val row_length_via: |
1383 | (* get_entry: *) (int -> int) -> |
1384 | (* i: *) int -> |
1385 | int |
1386 | |
1387 | val read_via: |
1388 | (* get_data: *) (int -> 'a) -> |
1389 | (* get_entry: *) (int -> int) -> |
1390 | (* i: *) int -> |
1391 | (* j: *) int -> |
1392 | 'a |
1393 | |
1394 | val read_row_via: |
1395 | (* get_data: *) (int -> 'a) -> |
1396 | (* get_entry: *) (int -> int) -> |
1397 | (* i: *) int -> |
1398 | 'a list |
1399 | |
1400 | end |
1401 | module TableFormat : sig |
1402 | (******************************************************************************) |
1403 | (* *) |
1404 | (* Menhir *) |
1405 | (* *) |
1406 | (* François Pottier, Inria Paris *) |
1407 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1408 | (* *) |
1409 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1410 | (* terms of the GNU Library General Public License version 2, with a *) |
1411 | (* special exception on linking, as described in the file LICENSE. *) |
1412 | (* *) |
1413 | (******************************************************************************) |
1414 | |
1415 | (* This signature defines the format of the parse tables. It is used as |
1416 | an argument to [TableInterpreter.Make]. *) |
1417 | |
1418 | module type TABLES = sig |
1419 | |
1420 | (* This is the parser's type of tokens. *) |
1421 | |
1422 | type token |
1423 | |
1424 | (* This maps a token to its internal (generation-time) integer code. *) |
1425 | |
1426 | val token2terminal: token -> int |
1427 | |
1428 | (* This is the integer code for the error pseudo-token. *) |
1429 | |
1430 | val error_terminal: int |
1431 | |
1432 | (* This maps a token to its semantic value. *) |
1433 | |
1434 | val token2value: token -> Obj.t |
1435 | |
1436 | (* Traditionally, an LR automaton is described by two tables, namely, an |
1437 | action table and a goto table. See, for instance, the Dragon book. |
1438 | |
1439 | The action table is a two-dimensional matrix that maps a state and a |
1440 | lookahead token to an action. An action is one of: shift to a certain |
1441 | state, reduce a certain production, accept, or fail. |
1442 | |
1443 | The goto table is a two-dimensional matrix that maps a state and a |
1444 | non-terminal symbol to either a state or undefined. By construction, this |
1445 | table is sparse: its undefined entries are never looked up. A compression |
1446 | technique is free to overlap them with other entries. |
1447 | |
1448 | In Menhir, things are slightly different. If a state has a default |
1449 | reduction on token [#], then that reduction must be performed without |
1450 | consulting the lookahead token. As a result, we must first determine |
1451 | whether that is the case, before we can obtain a lookahead token and use it |
1452 | as an index in the action table. |
1453 | |
1454 | Thus, Menhir's tables are as follows. |
1455 | |
1456 | A one-dimensional default reduction table maps a state to either ``no |
1457 | default reduction'' (encoded as: 0) or ``by default, reduce prod'' |
1458 | (encoded as: 1 + prod). The action table is looked up only when there |
1459 | is no default reduction. *) |
1460 | |
1461 | val default_reduction: PackedIntArray.t |
1462 | |
1463 | (* Menhir follows Dencker, Dürre and Heuft, who point out that, although the |
1464 | action table is not sparse by nature (i.e., the error entries are |
1465 | significant), it can be made sparse by first factoring out a binary error |
1466 | matrix, then replacing the error entries in the action table with undefined |
1467 | entries. Thus: |
1468 | |
1469 | A two-dimensional error bitmap maps a state and a terminal to either |
1470 | ``fail'' (encoded as: 0) or ``do not fail'' (encoded as: 1). The action |
1471 | table, which is now sparse, is looked up only in the latter case. *) |
1472 | |
1473 | (* The error bitmap is flattened into a one-dimensional table; its width is |
1474 | recorded so as to allow indexing. The table is then compressed via |
1475 | [PackedIntArray]. The bit width of the resulting packed array must be |
1476 | [1], so it is not explicitly recorded. *) |
1477 | |
1478 | (* The error bitmap does not contain a column for the [#] pseudo-terminal. |
1479 | Thus, its width is [Terminal.n - 1]. We exploit the fact that the integer |
1480 | code assigned to [#] is greatest: the fact that the right-most column |
1481 | in the bitmap is missing does not affect the code for accessing it. *) |
1482 | |
1483 | val error: int (* width of the bitmap *) * string (* second component of [PackedIntArray.t] *) |
1484 | |
1485 | (* A two-dimensional action table maps a state and a terminal to one of |
1486 | ``shift to state s and discard the current token'' (encoded as: s | 10), |
1487 | ``shift to state s without discarding the current token'' (encoded as: s | |
1488 | 11), or ``reduce prod'' (encoded as: prod | 01). *) |
1489 | |
1490 | (* The action table is first compressed via [RowDisplacement], then packed |
1491 | via [PackedIntArray]. *) |
1492 | |
1493 | (* Like the error bitmap, the action table does not contain a column for the |
1494 | [#] pseudo-terminal. *) |
1495 | |
1496 | val action: PackedIntArray.t * PackedIntArray.t |
1497 | |
1498 | (* A one-dimensional lhs table maps a production to its left-hand side (a |
1499 | non-terminal symbol). *) |
1500 | |
1501 | val lhs: PackedIntArray.t |
1502 | |
1503 | (* A two-dimensional goto table maps a state and a non-terminal symbol to |
1504 | either undefined (encoded as: 0) or a new state s (encoded as: 1 + s). *) |
1505 | |
1506 | (* The goto table is first compressed via [RowDisplacement], then packed |
1507 | via [PackedIntArray]. *) |
1508 | |
1509 | val goto: PackedIntArray.t * PackedIntArray.t |
1510 | |
1511 | (* The number of start productions. A production [prod] is a start |
1512 | production if and only if [prod < start] holds. This is also the |
1513 | number of start symbols. A nonterminal symbol [nt] is a start |
1514 | symbol if and only if [nt < start] holds. *) |
1515 | |
1516 | val start: int |
1517 | |
1518 | (* A one-dimensional semantic action table maps productions to semantic |
1519 | actions. The calling convention for semantic actions is described in |
1520 | [EngineTypes]. This table contains ONLY NON-START PRODUCTIONS, so the |
1521 | indexing is off by [start]. Be careful. *) |
1522 | |
1523 | val semantic_action: ((int, Obj.t, token) EngineTypes.env -> |
1524 | (int, Obj.t) EngineTypes.stack) array |
1525 | |
1526 | (* The parser defines its own [Error] exception. This exception can be |
1527 | raised by semantic actions and caught by the engine, and raised by the |
1528 | engine towards the final user. *) |
1529 | |
1530 | exception Error |
1531 | |
1532 | (* The parser indicates whether to generate a trace. Generating a |
1533 | trace requires two extra tables, which respectively map a |
1534 | terminal symbol and a production to a string. *) |
1535 | |
1536 | val trace: (string array * string array) option |
1537 | |
1538 | end |
1539 | end |
1540 | module InspectionTableFormat : sig |
1541 | (******************************************************************************) |
1542 | (* *) |
1543 | (* Menhir *) |
1544 | (* *) |
1545 | (* François Pottier, Inria Paris *) |
1546 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1547 | (* *) |
1548 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1549 | (* terms of the GNU Library General Public License version 2, with a *) |
1550 | (* special exception on linking, as described in the file LICENSE. *) |
1551 | (* *) |
1552 | (******************************************************************************) |
1553 | |
1554 | (* This signature defines the format of the tables that are produced (in |
1555 | addition to the tables described in [TableFormat]) when the command line |
1556 | switch [--inspection] is enabled. It is used as an argument to |
1557 | [InspectionTableInterpreter.Make]. *) |
1558 | |
1559 | module type TABLES = sig |
1560 | |
1561 | (* The types of symbols. *) |
1562 | |
1563 | include IncrementalEngine.SYMBOLS |
1564 | |
1565 | (* The type ['a lr1state] describes an LR(1) state. The generated parser defines |
1566 | it internally as [int]. *) |
1567 | |
1568 | type 'a lr1state |
1569 | |
1570 | (* Some of the tables that follow use encodings of (terminal and |
1571 | nonterminal) symbols as integers. So, we need functions that |
1572 | map the integer encoding of a symbol to its algebraic encoding. *) |
1573 | |
1574 | val terminal: int -> xsymbol |
1575 | val nonterminal: int -> xsymbol |
1576 | |
1577 | (* The left-hand side of every production already appears in the |
1578 | signature [TableFormat.TABLES], so we need not repeat it here. *) |
1579 | |
1580 | (* The right-hand side of every production. This a linearized array |
1581 | of arrays of integers, whose [data] and [entry] components have |
1582 | been packed. The encoding of symbols as integers in described in |
1583 | [TableBackend]. *) |
1584 | |
1585 | val rhs: PackedIntArray.t * PackedIntArray.t |
1586 | |
1587 | (* A mapping of every (non-initial) state to its LR(0) core. *) |
1588 | |
1589 | val lr0_core: PackedIntArray.t |
1590 | |
1591 | (* A mapping of every LR(0) state to its set of LR(0) items. Each item is |
1592 | represented in its packed form (see [Item]) as an integer. Thus the |
1593 | mapping is an array of arrays of integers, which is linearized and |
1594 | packed, like [rhs]. *) |
1595 | |
1596 | val lr0_items: PackedIntArray.t * PackedIntArray.t |
1597 | |
1598 | (* A mapping of every LR(0) state to its incoming symbol, if it has one. *) |
1599 | |
1600 | val lr0_incoming: PackedIntArray.t |
1601 | |
1602 | (* A table that tells which non-terminal symbols are nullable. *) |
1603 | |
1604 | val nullable: string |
1605 | (* This is a packed int array of bit width 1. It can be read |
1606 | using [PackedIntArray.get1]. *) |
1607 | |
1608 | (* A two-table dimensional table, indexed by a nonterminal symbol and |
1609 | by a terminal symbol (other than [#]), encodes the FIRST sets. *) |
1610 | |
1611 | val first: int (* width of the bitmap *) * string (* second component of [PackedIntArray.t] *) |
1612 | |
1613 | end |
1614 | |
1615 | end |
1616 | module InspectionTableInterpreter : sig |
1617 | (******************************************************************************) |
1618 | (* *) |
1619 | (* Menhir *) |
1620 | (* *) |
1621 | (* François Pottier, Inria Paris *) |
1622 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1623 | (* *) |
1624 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1625 | (* terms of the GNU Library General Public License version 2, with a *) |
1626 | (* special exception on linking, as described in the file LICENSE. *) |
1627 | (* *) |
1628 | (******************************************************************************) |
1629 | |
1630 | (* This functor is invoked inside the generated parser, in [--table] mode. It |
1631 | produces no code! It simply constructs the types [symbol] and [xsymbol] on |
1632 | top of the generated types [terminal] and [nonterminal]. *) |
1633 | |
1634 | module Symbols (T : sig |
1635 | |
1636 | type 'a terminal |
1637 | type 'a nonterminal |
1638 | |
1639 | end) |
1640 | |
1641 | : IncrementalEngine.SYMBOLS |
1642 | with type 'a terminal := 'a T.terminal |
1643 | and type 'a nonterminal := 'a T.nonterminal |
1644 | |
1645 | (* This functor is invoked inside the generated parser, in [--table] mode. It |
1646 | constructs the inspection API on top of the inspection tables described in |
1647 | [InspectionTableFormat]. *) |
1648 | |
1649 | module Make |
1650 | (TT : TableFormat.TABLES) |
1651 | (IT : InspectionTableFormat.TABLES |
1652 | with type 'a lr1state = int) |
1653 | (ET : EngineTypes.TABLE |
1654 | with type terminal = int |
1655 | and type nonterminal = int |
1656 | and type semantic_value = Obj.t) |
1657 | (E : sig |
1658 | type 'a env = (ET.state, ET.semantic_value, ET.token) EngineTypes.env |
1659 | end) |
1660 | |
1661 | : IncrementalEngine.INSPECTION |
1662 | with type 'a terminal := 'a IT.terminal |
1663 | and type 'a nonterminal := 'a IT.nonterminal |
1664 | and type 'a lr1state := 'a IT.lr1state |
1665 | and type production := int |
1666 | and type 'a env := 'a E.env |
1667 | end |
1668 | module TableInterpreter : sig |
1669 | (******************************************************************************) |
1670 | (* *) |
1671 | (* Menhir *) |
1672 | (* *) |
1673 | (* François Pottier, Inria Paris *) |
1674 | (* Yann Régis-Gianas, PPS, Université Paris Diderot *) |
1675 | (* *) |
1676 | (* Copyright Inria. All rights reserved. This file is distributed under the *) |
1677 | (* terms of the GNU Library General Public License version 2, with a *) |
1678 | (* special exception on linking, as described in the file LICENSE. *) |
1679 | (* *) |
1680 | (******************************************************************************) |
1681 | |
1682 | (* This module provides a thin decoding layer for the generated tables, thus |
1683 | providing an API that is suitable for use by [Engine.Make]. It is part of |
1684 | [MenhirLib]. *) |
1685 | |
1686 | (* The exception [Error] is declared within the generated parser. This is |
1687 | preferable to pre-declaring it here, as it ensures that each parser gets |
1688 | its own, distinct [Error] exception. This is consistent with the code-based |
1689 | back-end. *) |
1690 | |
1691 | (* This functor is invoked by the generated parser. *) |
1692 | |
1693 | module MakeEngineTable |
1694 | (T : TableFormat.TABLES) |
1695 | : EngineTypes.TABLE |
1696 | with type state = int |
1697 | and type token = T.token |
1698 | and type semantic_value = Obj.t |
1699 | and type production = int |
1700 | and type terminal = int |
1701 | and type nonterminal = int |
1702 | end |
1703 | module StaticVersion : sig |
1704 | val require_20181113 : unit |
1705 | end |
1706 |