123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247(*
* Copyright (c) 2025 Daniil Baturin
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*)exceptionGrammar_errorofstringtypesymbol_flags={(* The symbol itself is marked sticky in the grammar. *)sticky:bool;(* The symbol must be rendered as sticky
because it was produced during reduction of a sticky parent.
The reason not to use the normal sticky flag
is to be able to render rules for debugging purposes
exactly as they were in the source grammar.
The reason why we need that flag propagation at all
is that symbol reduction is tail-recursive
to support infinite depth and output size,
so there is no other way to determine if we need
to emit a separator or not.
In a naively recursive implementation,
we could just check for stickyness before return,
but in a tail-recursive implementation,
we have to pass that state around.
*)sticky_parent:bool;}typesymbol=|Terminalof(string*symbol_flags)|Nonterminalof(string*symbol_flags)|Repeatofsymbol*(int*int)typerule_alternative={weight:int;symbols:symbollist}typerule=string*(rule_alternativelist)typegrammar=rulelistletdefault_flags={sticky=false;sticky_parent=false}letgrammar_errors=raise(Grammar_errors)(* Anything to string, mainly for parser debug *)letsticky_flagsym_data=(* Only render the sticky flag if it was originally there
and was not added during reduction of a sticky parent. *)ifsym_data.stickythen"~"else""letrecstring_of_symbols=matchswith|Terminal(content,sym_data)->Printf.sprintf"\"%s\"%s"content(sticky_flagsym_data)|Nonterminal(name,sym_data)->Printf.sprintf"<%s>%s"name(sticky_flagsym_data)|Repeat(s,(min,max))->if(min=max)thenPrintf.sprintf"%s{%d}"(string_of_symbols)minelsePrintf.sprintf"%s{%d-%d}"(string_of_symbols)minmaxletstring_of_rule_rhs_partr=letl=List.mapstring_of_symbolr.symbolsinletsym_str=String.concat" "linletweight_str=ifr.weight=1then""elsestring_of_intr.weightinString.concat" "[weight_str;sym_str]letrecstring_of_rule_rhsr=matchrwith|[]->""|[hd]->string_of_rule_rhs_parthd|hd::tl->Printf.sprintf"%s | %s"(string_of_rule_rhs_parthd)(string_of_rule_rhstl)letstring_of_ruler=let(name,alts)=rinPrintf.sprintf"%s ::= %s;"(string_of_symbol(Nonterminal(name,default_flags)))(string_of_rule_rhsalts)letto_stringr=letrule_str_list=List.mapstring_of_rulerinString.concat"\n"rule_str_list(* Rule sanity checking.
Making illegal states unrepresentable would require dependent types,
so we cannot avoid runtime checks.
The parser takes care of some but not all of these errors,
so it's a good idea to run check_grammar on all grammars.
*)letcheck_for_nonexistentrs=letget_left_hand_sidesrs=List.map(fun(name,_)->name)rsinletget_referenced_nonterminalsr=let(_,alts)=rinletsymbols=alts|>List.map(funr->r.symbols)|>List.concatinList.fold_left(funsss->matchswithNonterminal(s',_)->s'::ss|_->ss)[]symbolsinletcheck_symbolknown_symbolsname=tryignore@@List.find((=)name)known_symbolswith_->Printf.ksprintfgrammar_error"Undefined symbol <%s>"nameinletrecauxrsknown_symbols=matchrswith|[]->()|r::rs->letnts=get_referenced_nonterminalsrinlet_=List.iter(check_symbolknown_symbols)ntsinauxrsknown_symbolsinauxrs(get_left_hand_sidesrs)letreccheck_for_duplicatesrs=letsort_rulesrs=List.sort(fun(x,_)(y,_)->comparexy)rsinletrs=sort_rulesrsinmatchrswith|[]|[_]->()|r::r'::rs->let(name,_)=rinlet(name',_)=r'inifname=name'thenPrintf.ksprintfgrammar_error"Duplicate definition of symbol <%s>"nameelsecheck_for_duplicates(r'::rs)letreccheck_for_empty_rulesrs=matchrswith|[]->()|(name,[])::_->Printf.ksprintfgrammar_error"Empty rule for symbol <%s>"name|_::rs->check_for_empty_rulesrsletcheck_repeatsrs=letcheck_rule_repeats(_,r)=letsyms=List.map(funx->x.symbols)r|>List.concatinList.iter(funs->matchswith|Repeat(_,(min,max))->if(min>max)thenPrintf.ksprintfgrammar_error"Malformed range {%d,%d} (min > max)"minmax|_->())symsinList.itercheck_rule_repeatsrsletcheck_grammarrs=check_for_empty_rulesrs;check_for_duplicatesrs;check_for_nonexistentrs;check_repeatsrs(** Rule reduction *)(* We support weight for cases in rules with alternation,
so we need to support weighted random selection here. *)lettotal_weightl=List.fold_left(funxy->x+y.weight)0lletrecweighted_randomaccrl=matchlwith|[]->Printf.ksprintfgrammar_error"Rule with empty right-hand side"|[x]->x.symbols|hd::tl->letacc'=acc+hd.weightinifr<acc'thenhd.symbolselseweighted_randomacc'rtlletpick_elementl=lettw=total_weightlinletr=Random.inttwinweighted_random0rlletfind_productionnamegrammar=List.assoc_optnamegrammarletsort_rule_partsl=List.sort(funxy->comparex.weighty.weight)lletrecmake_stickysym=matchsymwith|Terminal(content,sym_flags)->Terminal(content,{sym_flagswithsticky_parent=true})|Nonterminal(name,sym_flags)->Nonterminal(name,{sym_flagswithsticky_parent=true})|Repeat(sym,num)->Repeat((make_stickysym),num)letis_stickysym_data=sym_data.sticky||sym_data.sticky_parentletreduce_symbol?(debug=false)?(debug_fun=print_endline)?(separator="")sym_stackgrammar=matchsym_stackwith|[]->(None,[])|sym::syms->matchsymwith|Terminal(content,sym_data)->let()=ifdebugthenPrintf.ksprintfdebug_fun{|Emitting terminal "%s"|}contentinletoutput=if((is_stickysym_data)||(separator=""))thencontentelsecontent^separatorin(Someoutput,syms)|Nonterminal(name,sym_data)->let()=ifdebugthenPrintf.ksprintfdebug_fun"Reducing symbol <%s>"nameinletrhs=find_productionnamegrammarinletrhs=ifOption.is_somerhsthenOption.getrhselsePrintf.ksprintfgrammar_error"Undefined symbol <%s>"nameinletnew_syms=pick_elementrhsinlet()=ifdebugthenPrintf.ksprintfdebug_fun"Alternative taken: %s"(string_of_rule_rhs_part{weight=1;symbols=new_syms})inletnew_syms=(* If the current symbol is not sticky,
we just push everything it produced on the stack and proceed.
But if it's sticky, we need to make sure not to emit the separator
when we emit the last terminal that was produced from that sticky parent.
So we mark the last symbol in the list with [sticky_parent] flag
to allow that marker to eventually propagate down to the last terminal.
*)ifnot(is_stickysym_data)thennew_symselsebeginletrev_new_syms=List.revnew_symsinletlast_sym=List.hdrev_new_symsinletlast_sym=make_stickylast_syminlast_sym::List.tlrev_new_syms|>List.revendinletsyms=List.appendnew_symssymsin(None,syms)|Repeat(s,(min,max))->if(min>max)thenPrintf.ksprintfgrammar_error"Malformed range {%d,%d} (min > max)"minmaxelse(* The argument of [Random.int] is an exclusive boundary,
while BNFGen ranges are inclusive.
We compensate for that by always adding +1 to the boundary. *)lettimes=if(min=max)thenminelse((Random.int(max-min+1))+min)inletnew_syms=List.inittimes(fun_->s)inlet()=ifdebugthenPrintf.ksprintfdebug_fun"Repetition range {%d,%d}, repeating %d times"minmaxtimesinletsyms=List.appendnew_symssymsin(None,syms)