123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138(* Copyright (C) Gabriel Hondet.
Subject to the BSD-3-Clause license *)(** Transform strings of tokens and mixfix operators into full binary trees.
Operators are characterised by their associativity and their fixity.
To parse expressions of type ['a], you need to tell the parser
- how to concatenate two expressions with a function of type
['a -> 'a -> 'a]; this function can be seen as the concatenation of two
binary trees (and in that case, the input of the parser is a string of
leaves);
- how to determine whether a value of ['a] should be considered as an
operator.
The algorithm implemented is an extension of the Pratt parser. The Shunting
Yard algorithm could also be used.
@see <https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html>
@see <https://dev.to/jrop/pratt-parsing> *)(** Associativity of an operator. *)typeassociativity=|Left(** If [+] is a left associative operator, [x + y + z] is parsed [(x +
y) + z]. *)|Right(** If [+] is a right associative operator, [x + y + z] is parsed [x +
(y + z)]. *)|Neither(** If [+] is not associative, then [(x + y) + z] is not [x + (y + z)] and
[x + y + z] results in a syntax error. *)(** The fixity allows to determine where the arguments of an operator are. *)typefixity=|Infixofassociativity(** The operator is between its arguments, such as [=] in [x = y]. *)|Prefix(** The operator is before its argument, such as [¬] in [¬ P]. *)|Postfix(** The operator is after its argument, such as [²] in [x²]. *)type'terror=[`OpConflictof't*'t(** Priority or associativiy conflict between two operators.
In [`OpConflict (t,o)], [o] is an operator which generates a conflict
preventing term [t] to be parsed. *)|`UnexpectedInfixof't(** An infix operator appears without left context. If [+] is an
infix operator, it is raised in, e.g., [+ x x] or [x + + x x]. *)|`UnexpectedPostfixof't(** A postfix operator appears without left context. If [!] is a
postfix operator, it is raised in [! x]. *)|`TooFewArguments(** More arguments are expected. It is raised for instance on
partial application of operators, such as [x +]. *)](** Errors that can be encountered while parsing a stream of terms. *)(** [expression appl is_op s] parses the stream of tokens [s] and turns it into
a full binary tree.
If tokens are seen as leaves of binary trees, the function [appl] is the
concatenation of two binary trees. If tokens are seen as terms, [appl]
is the application.
The function [is_op] is in charge of specifying which tokens are operators:
for any term [t], [is_op t] must return [Some (f, p)] whenever [t] is an
operator with fixity [f] and binding power (or precedence) [p]. If [t]
isn't an operator, [is_op] should return [None].
For instance, assuming that [+] is declared infix and we're working with
numbers, it can transform [3 + 5 × 2] encoded as the stream of terms [3, +,
5, ×, 2] into the binary tree [@(@(×,@(@(+,3),5)),2)] where [@] denotes
nodes. *)letexpression:appl:('a->'a->'a)->is_op:('a->(fixity*float)option)->'aStream.t->('a,'aerror)result=fun~appl~is_op->(* [nud tbl strm t] is the production of term [t] with {b no} left context.
If [t] is not a prefix operator, [nud] is the identity. Otherwise, the
output is a production rule. *)letrecnudstrmt=matchis_optwith|Some(Prefix,rbp)->Result.map(applt)(expression~rbp~rassoc:Neitherstrm)|Some(Infix_,_)->Error(`UnexpectedInfixt)(* If the line above is erased, [+ x x] is parsed as [(+ x) x], and
[x + + x x] as [(+ x) ((+ x) x)]. *)|Some(Postfix,_)->Error(`UnexpectedPostfixt)|_->Okt(* [led ~strm ~left t assoc bp] is the production of term [t] with
left context [left]. We have the invariant that [t] is a binary operator
with associativity [assoc] and binding power [bp]. This invariant is
ensured while called in {!val:expression}. *)andled~strm~lefttassocbp=letrbp=matchassocwith|Right->bp*.(1.-.epsilon_float)|Left|Neither->bpinResult.map(appl(appltleft))(expression~rbp~rassoc:assocstrm)(* [expression ~rbp ~rassoc strm] parses next token of stream
[strm] with previous operator having a right binding power [~rbp] and
associativity [~rassoc]. *)andexpression~rbp~rassocstrm=(* [aux left] inspects the stream and may consume one of its elements, or
return [left] unchanged. *)letrecaux(left:'a)=matchStream.peekstrmwith|None->Okleft|Somept->(matchis_opptwith|Some(Infixlassoc,lbp)->iflbp>rbp||(lbp=rbp&&lassoc=Right&&rassoc=Right)then(* Performed before to execute side effect on stream. *)letnext=Stream.nextstrminResult.bind(led~strm~leftnextlassoclbp)auxelseiflbp<rbp||(lbp=rbp&&lassoc=Left&&rassoc=Left)thenOkleftelseError(`OpConflict(left,pt))|Some(Postfix,lbp)->iflbp>rbpthenletnext=Stream.nextstrminaux(applnextleft)elseiflbp=rbpthenError(`OpConflict(left,pt))elseOkleft|Some(Prefix,_)|None->(* argument of an application *)letnext=Stream.nextstrminResult.bind(nudstrmnext)(funright->aux(applleftright)))intryletnext=Stream.nextstrminletleft=nudstrmnextinResult.bindleftauxwithStream.Failure->Error`TooFewArgumentsinexpression~rbp:neg_infinity~rassoc:Neither