123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617open!ImportmodulePublic=structtypestate_cst={token_buffer:Buffer.t;(* Starting positions of the current token *)mutabletoken_start_pos:Positions.pos}type('u,'s)kind=|Positions:(Positions.Builder.t,unit)kind|Sexp:(unit,Automaton_stack.t)kind|Sexp_with_positions:(Positions.Builder.t,Automaton_stack.t)kind|Cst:(state_cst,Automaton_stack.For_cst.t)kindtype('u,'s)state={mutableautomaton_state:int;kind:('u,'s)kind;mutabledepth:int;(* Number of opened #| when parsing a block comment *)mutableblock_comment_depth:int;(* Stack of ignoring depths; the current depth is pushed
each time a #; comment is entered. *)mutableignoring_stack:intlist;(* When parsing an escape sequence of the form "\\NNN" or "\\XX", this accumulates
the computed number *)mutableescaped_value:int;(* Buffer for accumulating atoms *)atom_buffer:Buffer.t;user_state:'u;mode:('u,'s)mode;mutablefull_sexps:int;mutableoffset:int(* global offset *);mutableline_number:int;mutablebol_offset:int(* offset of beginning of line *)}and('u,'s)mode=|Single|Many|Eagerof{got_sexp:('u,'s)state->'s->'s;mutableno_sexp_is_error:bool}letinitial_user_state:typeus.(u,s)kind->Positions.pos->u=funkindinitial_pos->matchkindwith|Positions->Positions.Builder.create~initial_pos()|Sexp->()|Sexp_with_positions->Positions.Builder.create~initial_pos()|Cst->(* [token_start_pos] is set to a dummy location here. It is properly set when we
start to capture a token from the input *){token_buffer=Buffer.create128;token_start_pos=Positions.beginning_of_file};;(* these magic numbers are checked in gen_parser_automaton.ml:
let () = assert (initial = 0)
let () = assert (to_int Error = 1) *)letinitial_state=0leterror_state=1letnew_state?(initial_pos=Positions.beginning_of_file)modekind={kind;depth=0;automaton_state=initial_state;block_comment_depth=0;ignoring_stack=[];escaped_value=0;atom_buffer=Buffer.create128;user_state=initial_user_statekindinitial_pos;mode;full_sexps=0;offset=initial_pos.offset;line_number=initial_pos.line;bol_offset=initial_pos.offset-initial_pos.col};;letmodet=t.modeletpositionst=Positions.Builder.contentst.user_stateletatom_buffert=t.atom_bufferletoffsetstate=state.offsetletlinestate=state.line_numberletcolumnstate=state.offset-state.bol_offsetletpositiont={Positions.col=columnt;line=linet;offset=offsett}letreset_user_state:typeus.(u,s)state->unit=funt->matcht.kindwith|Positions->Positions.Builder.resett.user_state(positiont)|Sexp->()|Sexp_with_positions->Positions.Builder.resett.user_state(positiont)|Cst->Buffer.cleart.user_state.token_buffer;;letreset?(pos=Positions.beginning_of_file)t=t.depth<-0;t.automaton_state<-initial_state;t.block_comment_depth<-0;t.ignoring_stack<-[];t.escaped_value<-0;t.full_sexps<-0;t.offset<-pos.offset;t.line_number<-pos.line;t.bol_offset<-pos.offset-pos.col;reset_user_statet;Buffer.cleart.atom_buffer;;typecontext=|Sexp_comment|Sexpletis_ignoringstate=matchstate.ignoring_stackwith|_::_->true|[]->false;;letis_not_ignoringstate=not(is_ignoringstate)letcontextstate=ifis_not_ignoringstatethenSexpelseSexp_commentlethas_unclosed_parenstate=state.depth>0letset_error_statestate=state.automaton_state<-error_statemoduleError=Parse_errorletautomaton_statestate=state.automaton_stateendopenPublicletraise_error:typeab.(a,b)state->_=funstate~at_eofreason->set_error_statestate;Parse_error.Private.raisereason{line=state.line_number;col=state.offset-state.bol_offset;offset=state.offset}~at_eof~atom_buffer:state.atom_buffer;;typenonreccontext=context=|Sexp_comment|Sexpletcontext=contexttype('u,'s)action=('u,'s)state->char->'s->'stype('u,'s)epsilon_action=('u,'s)state->'s->'sletcurrent_pos?(delta=0)state:Positions.pos=letoffset=state.offset+deltain{line=state.line_number;col=offset-state.bol_offset;offset};;letset_automaton_statestatex=state.automaton_state<-xletadvancestate=state.offset<-state.offset+1letadvance_eol:typeus.(u,s)state->unit=funstate->letnewline_offset=state.offsetinstate.offset<-newline_offset+1;state.bol_offset<-state.offset;state.line_number<-state.line_number+1;matchstate.kindwith|Positions->Positions.Builder.add_newlinestate.user_state~offset:newline_offset|Sexp_with_positions->Positions.Builder.add_newlinestate.user_state~offset:newline_offset|_->();;letblock_comment_depthstate=state.block_comment_depthletadd_token_char:typeus.(u,s)action=funstatecharstack->matchstate.kindwith|Cst->Buffer.add_charstate.user_state.token_bufferchar;stack|_->stack;;letadd_atom_charstatecstack=Buffer.add_charstate.atom_bufferc;stack;;letadd_quoted_atom_charstatecstack=Buffer.add_charstate.atom_bufferc;add_token_charstatecstack;;letcheck_new_sexp_allowedstate=letis_single=matchstate.modewith|Single->true|_->falseinifis_single&&state.full_sexps>0&&is_not_ignoringstatethenraise_errorstate~at_eof:falseToo_many_sexps;;letadd_posstate~delta=Positions.Builder.addstate.user_state~offset:(state.offset+delta);;letadd_first_char:typeus.(u,s)action=funstatecharstack->check_new_sexp_allowedstate;Buffer.add_charstate.atom_bufferchar;(* For non-quoted atoms, we save both positions at the end. We can always determine the
start position from the end position and the atom length for non-quoted atoms.
Doing it this way allows us to detect single characater atoms for which we need to
save the position twice. *)stack;;leteps_add_first_char_hash:typeus.(u,s)epsilon_action=funstatestack->check_new_sexp_allowedstate;Buffer.add_charstate.atom_buffer'#';stack;;letstart_quoted_string:typeus.(u,s)action=funstate_charstack->check_new_sexp_allowedstate;matchstate.kindwith|Positions->ifis_not_ignoringstatethenadd_posstate~delta:0;stack|Sexp_with_positions->ifis_not_ignoringstatethenadd_posstate~delta:0;stack|Cst->state.user_state.token_start_pos<-current_posstate;Buffer.add_charstate.user_state.token_buffer'"';stack|Sexp->stack;;letadd_escapedstatecstack=letc'=matchcwith|'n'->'\n'|'r'->'\r'|'b'->'\b'|'t'->'\t'|'\\'|'\''|'"'->c|_->Buffer.add_charstate.atom_buffer'\\';cinBuffer.add_charstate.atom_bufferc';add_token_charstatecstack;;leteps_add_escaped_crstatestack=Buffer.add_charstate.atom_buffer'\r';stack;;letdec_valc=Char.codec-Char.code'0'lethex_valc=matchcwith|'0'..'9'->Char.codec-Char.code'0'|'a'..'f'->Char.codec-Char.code'a'+10|_->Char.codec-Char.code'A'+10;;letadd_dec_escape_charstatecstack=state.escaped_value<-(state.escaped_value*10)+dec_valc;add_token_charstatecstack;;letadd_last_dec_escape_charstatecstack=letvalue=(state.escaped_value*10)+dec_valcinstate.escaped_value<-0;ifvalue>255thenraise_errorstate~at_eof:falseEscape_sequence_out_of_range;Buffer.add_charstate.atom_buffer(Char.chrvalue);add_token_charstatecstack;;letcomment_add_last_dec_escape_charstatecstack=letvalue=(state.escaped_value*10)+dec_valcinstate.escaped_value<-0;ifvalue>255thenraise_errorstate~at_eof:falseEscape_sequence_out_of_range;add_token_charstatecstack;;letadd_hex_escape_charstatecstack=state.escaped_value<-(state.escaped_valuelsl4)lorhex_valc;add_token_charstatecstack;;letadd_last_hex_escape_charstatecstack=letvalue=(state.escaped_valuelsl4)lorhex_valcinstate.escaped_value<-0;Buffer.add_charstate.atom_buffer(Char.chrvalue);add_token_charstatecstack;;letopening:typeus.(u,s)state->char->s->s=funstate_charstack->check_new_sexp_allowedstate;state.depth<-state.depth+1;matchstate.kindwith|Positions->ifis_not_ignoringstatethenadd_posstate~delta:0;stack|Sexp->ifis_not_ignoringstatethenOpenstackelsestack|Sexp_with_positions->ifis_not_ignoringstatethen(add_posstate~delta:0;Openstack)elsestack|Cst->Open(current_posstate,stack);;letdo_reset_positionsstate=Positions.Builder.resetstate.user_state{line=state.line_number;col=state.offset-state.bol_offset;offset=state.offset};;letreset_positions:typeus.(u,s)state->unit=funstate->matchstate.kindwith|Positions->do_reset_positionsstate|Sexp_with_positions->do_reset_positionsstate|Sexp->()|Cst->();;lettoplevel_sexp_or_comment_addedstatestack~delta=matchstate.modewith|Single|Many->stack|Eager{got_sexp=f;_}->(* Modify the offset so that [f] get a state pointing to the end of the current
s-expression *)letsaved_offset=state.offsetinstate.offset<-state.offset+delta;letsaved_full_sexps=state.full_sexpsin(matchfstatestackwith|exceptione->set_error_statestate;raisee|stack->(* This assert is not a full protection against the user mutating the state but
it should catch most cases. *)assert(state.offset=saved_offset+delta&&state.full_sexps=saved_full_sexps);state.offset<-saved_offset;reset_positionsstate;stack);;letis_top_levelstate=is_not_ignoringstate&&state.depth=0letcomment_added_assuming_cststatestack~delta=ifis_top_levelstatethentoplevel_sexp_or_comment_addedstatestack~deltaelsestack;;letmaybe_pop_ignoring_stackstate=matchstate.ignoring_stackwith|inner_comment_depth::_tlwheninner_comment_depth>state.depth->raise_errorstate~at_eof:falseSexp_comment_without_sexp|inner_comment_depth::tlwheninner_comment_depth=state.depth->state.ignoring_stack<-tl;true|_->false;;letsexp_added:typeus.(u,s)state->s->delta:int->s=funstatestack~delta->letis_comment=maybe_pop_ignoring_stackstateinifis_top_levelstatethen(ifnotis_commentthenstate.full_sexps<-state.full_sexps+1;if(notis_comment)||matchstate.kindwith|Cst->true|_->falsethentoplevel_sexp_or_comment_addedstatestack~deltaelsestack)elsestack;;letrecmake_listacc:Automaton_stack.t->Automaton_stack.t=function|Empty->assertfalse|Openstack->Sexp(Listacc,stack)|Sexp(sexp,stack)->make_list(sexp::acc)stack;;letadd_comment_to_stack_cstcomment(stack:Automaton_stack.For_cst.t):Automaton_stack.For_cst.t=matchstackwith|In_sexp_commentr->In_sexp_comment{rwithrev_comments=comment::r.rev_comments}|_->T_or_comment(Commentcomment,stack);;letadd_sexp_to_stack_cstsexp:Automaton_stack.For_cst.t->Automaton_stack.For_cst.t=function|In_sexp_comment{hash_semi_pos;rev_comments;stack}->letcomment:Cst.comment=Sexp_comment{hash_semi_pos;comments=List.revrev_comments;sexp}inadd_comment_to_stack_cstcommentstack|stack->T_or_comment(Sexpsexp,stack);;letrecmake_list_cstend_posacc:Automaton_stack.For_cst.t->Automaton_stack.For_cst.t=function|T_or_comment(t,stack)->make_list_cstend_pos(t::acc)stack|Open(start_pos,stack)->letsexp:Cst.t=List{loc={start_pos;end_pos};elements=acc}inadd_sexp_to_stack_cstsexpstack|Empty|In_sexp_comment_->assertfalse;;letclosing:typeus.(u,s)state->char->s->s=funstate_charstack->ifstate.depth>0then(letstack=matchstate.kindwith|Positions->(* Note we store end positions as inclusive in [Positions.t], so we use [delta:0],
while in the [Cst] case we save directly the final ranges, so we use
[delta:1]. *)ifis_not_ignoringstatethenadd_posstate~delta:0;stack|Sexp->ifis_not_ignoringstatethenmake_list[]stackelsestack|Sexp_with_positions->ifis_not_ignoringstatethen(add_posstate~delta:0;make_list[]stack)elsestack|Cst->make_list_cst(current_posstate~delta:1)[]stackinstate.depth<-state.depth-1;sexp_addedstatestack~delta:1)elseraise_errorstate~at_eof:falseClosed_paren_without_opened;;letmake_loc?(delta=0)state:Positions.range={start_pos=state.user_state.token_start_pos;end_pos=current_posstate~delta};;(* This is always called on the position exactly following the last character of a
non-quoted atom *)letadd_non_quoted_atom_posstate~atom=letlen=String.lengthatominiflen=1thenPositions.Builder.add_twicestate.user_state~offset:(state.offset-1)else(add_posstate~delta:(-len);add_posstate~delta:(-1));;leteps_push_atom:typeus.(u,s)epsilon_action=funstatestack->letstr=Buffer.contentsstate.atom_bufferinBuffer.clearstate.atom_buffer;letstack=matchstate.kindwith|Positions->ifis_not_ignoringstatethenadd_non_quoted_atom_posstate~atom:str;stack|Sexp->ifis_not_ignoringstatethenSexp(Atomstr,stack)elsestack|Sexp_with_positions->ifis_not_ignoringstatethen(add_non_quoted_atom_posstate~atom:str;Sexp(Atomstr,stack))elsestack|Cst->letloc:Positions.range={start_pos=current_posstate~delta:(-String.lengthstr);end_pos=current_posstate~delta:0}inletsexp:Cst.t=Atom{loc;atom=str;unescaped=Somestr}inadd_sexp_to_stack_cstsexpstackinsexp_addedstatestack~delta:0;;letpush_quoted_atom:typeus.(u,s)action=funstate_charstack->letstr=Buffer.contentsstate.atom_bufferinBuffer.clearstate.atom_buffer;letstack=matchstate.kindwith|Positions->ifis_not_ignoringstatethenadd_posstate~delta:0;stack|Sexp->ifis_not_ignoringstatethenSexp(Atomstr,stack)elsestack|Sexp_with_positions->ifis_not_ignoringstatethen(add_posstate~delta:0;Sexp(Atomstr,stack))elsestack|Cst->letbuf=state.user_state.token_bufferinBuffer.add_charbuf'"';lets=Buffer.contentsbufinBuffer.clearbuf;letsexp:Cst.t=Atom{loc=make_locstate~delta:1;atom=str;unescaped=Somes}inadd_sexp_to_stack_cstsexpstackinsexp_addedstatestack~delta:1;;letstart_sexp_comment:typeus.(u,s)action=funstate_charstack->state.ignoring_stack<-state.depth::state.ignoring_stack;matchstate.kindwith|Cst->In_sexp_comment{hash_semi_pos=current_posstate~delta:(-1);rev_comments=[];stack}|_->stack;;letstart_block_comment:typeus.(u,s)state->char->s->s=funstatecharstack->state.block_comment_depth<-state.block_comment_depth+1;matchstate.kindwith|Positions->stack|Sexp->stack|Sexp_with_positions->stack|Cst->ifstate.block_comment_depth=1then(state.user_state.token_start_pos<-current_posstate~delta:(-1);Buffer.add_charstate.user_state.token_buffer'#');Buffer.add_charstate.user_state.token_bufferchar;stack;;letend_block_comment:typeus.(u,s)state->char->s->s=funstatecharstack->state.block_comment_depth<-state.block_comment_depth-1;matchstate.kindwith|Positions->stack|Sexp->stack|Sexp_with_positions->stack|Cst->letbuf=state.user_state.token_bufferinBuffer.add_charbufchar;ifstate.block_comment_depth=0then(lets=Buffer.contentsbufinBuffer.clearbuf;letcomment:Cst.comment=Plain_comment{loc=make_locstate~delta:1;comment=s}inletstack=add_comment_to_stack_cstcommentstackincomment_added_assuming_cststatestack~delta:1)elsestack;;letstart_line_comment:typeus.(u,s)action=funstatecharstack->matchstate.kindwith|Cst->state.user_state.token_start_pos<-current_posstate;Buffer.add_charstate.user_state.token_bufferchar;stack|_->stack;;letend_line_comment:typeus.(u,s)epsilon_action=funstatestack->matchstate.kindwith|Positions->stack|Sexp->stack|Sexp_with_positions->stack|Cst->letbuf=state.user_state.token_bufferinlets=Buffer.contentsbufinBuffer.clearbuf;letcomment:Cst.comment=Plain_comment{loc=make_locstate;comment=s}inletstack=add_comment_to_stack_cstcommentstackincomment_added_assuming_cststatestack~delta:0;;leteps_eoi_check:typeus.(u,s)epsilon_action=funstatestack->ifstate.depth>0thenraise_errorstate~at_eof:trueUnclosed_paren;ifis_ignoringstatethenraise_errorstate~at_eof:trueSexp_comment_without_sexp;ifstate.full_sexps=0then(matchstate.modewith|Many|Eager{no_sexp_is_error=false;_}->()|Single|Eager{no_sexp_is_error=true;_}->raise_errorstate~at_eof:trueNo_sexp_found_in_input);stack;;