123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554(* This module builds a buffer of "instructions", in order to represent a compact sequence
of delimiting positions and newlines. The parser stores the positions of each:
- newline
- beginning of atom
- end of atom
- left parenthesis
- right parenthesis
Instructions are encoded as a sequence bits. The next instruction is determined by
looking at the next few bits:
- bit 0 represents a saved position followed by an offset increment
- bits 10 represent an offset increment
- bits 110 are followed by 5 bits of payload. The 5-bit payloads of any subsequent 110-
instructions are squashed to form a number (least significant 5-bit chunk first).
This number + 5 represents an offset increment
- bits 1110 marks the beginning of a new line (with offset incremented)
- bits 1111 represent a position saved twice followed by an offset increment
For instance let's consider the following sexp:
{[
{|
(abc
"foo
bar"
)
|}
]}
the sequence of instructions to record in order to reconstruct the position of any
sub-sexp is:
- 0 save position and advance 1: first '('
- 0 save position and advance 1: start of "abc"
- 10 advance 1
- 0 save position and advance 1: end of "abc"
- 1110 newline
- 1100_0001 advance 6
- 0 save position and advance 1: start of "foo\n bar"
- 10 advance 1
- 10 advance 1
- 10 advance 1
- 1110 newline
- 1100_0000 advance 5
- 0 save position and advance 1: end of "foo\n bar"
- 1110 newline
- 0 save position and advance 1: last ')'
(we save the position after the closing parenthesis)
The total sequence is 42 bits, so we need 6 bytes to store it
The sequence of bits is encoded as a sequence of 16-bit values, where the earlier bits
are most significant.
Note that the parser stores the end positions as inclusive. This way only single
character atoms require a double positions. If we were storing end positions as
exclusive, we would need double positions for [)(] and [a(], which are likely to be
frequent in s-expressions printed with the non [_hum] printer. We expect single
character atoms to be less frequent so it makes sense to penalize them instead.
*)openImportopenPpx_sexp_conv_libmoduleList=ListLabelstypepos={line:int;col:int;offset:int}[@@deriving_inlinesexp_of]letsexp_of_pos=(function|{line=v_line;col=v_col;offset=v_offset}->letbnds=[]inletbnds=letarg=sexp_of_intv_offsetin(Ppx_sexp_conv_lib.Sexp.List[Ppx_sexp_conv_lib.Sexp.Atom"offset";arg])::bndsinletbnds=letarg=sexp_of_intv_colin(Ppx_sexp_conv_lib.Sexp.List[Ppx_sexp_conv_lib.Sexp.Atom"col";arg])::bndsinletbnds=letarg=sexp_of_intv_linein(Ppx_sexp_conv_lib.Sexp.List[Ppx_sexp_conv_lib.Sexp.Atom"line";arg])::bndsinPpx_sexp_conv_lib.Sexp.Listbnds:pos->Ppx_sexp_conv_lib.Sexp.t)[@@@end]letcompare_pos=Caml.compareletbeginning_of_file={line=1;col=0;offset=0}letshift_pospos~cols={poswithcol=pos.col+cols;offset=pos.offset+cols}typerange={start_pos:pos;end_pos:pos}[@@deriving_inlinesexp_of]letsexp_of_range=(function|{start_pos=v_start_pos;end_pos=v_end_pos}->letbnds=[]inletbnds=letarg=sexp_of_posv_end_posin(Ppx_sexp_conv_lib.Sexp.List[Ppx_sexp_conv_lib.Sexp.Atom"end_pos";arg])::bndsinletbnds=letarg=sexp_of_posv_start_posin(Ppx_sexp_conv_lib.Sexp.List[Ppx_sexp_conv_lib.Sexp.Atom"start_pos";arg])::bndsinPpx_sexp_conv_lib.Sexp.Listbnds:range->Ppx_sexp_conv_lib.Sexp.t)[@@@end]letcompare_range=Caml.compareletmake_range_incl~start_pos~last_pos={start_pos;end_pos=shift_poslast_pos~cols:1}moduleChunk:sig(** Represents an array of [length/2] signed 16-bit values *)typet(** Length in bytes. *)vallength:intvalalloc:unit->t(** [get16 ~pos] and [set16 ~pos] manipulate the [pos/2]th stored value.
[pos] must be even.
[set16 x] only uses the 16 least significant bits of [x]. *)valget16:t->pos:int->intvalset16:t->pos:int->int->unitend=structtypet=bytes(* OCaml strings always waste two bytes at the end, so we take a power of two minus two
to be sure we don't waste space. *)letlength=62letalloc()=Bytes.createlengthexternalget16:bytes->pos:int->int="%caml_string_get16"externalset16:bytes->pos:int->int->unit="%caml_string_set16"(* If we want to make a [Positions.t] serializable:
{[
external bswap16 : int -> int = "%bswap16";;
let get16 =
if Caml.Sys.arch_big_endian then
fun buf ~pos -> get16 buf ~pos |> bswap16
else
get16
let set16 =
if Caml.Sys.arch_big_endian then
fun buf ~pos x -> set16 buf ~pos (bswap16 x)
else
set16
]}
*)endtypet_={chunks:Chunk.tlist;(* [num_bytes * 8 + extra_bits] is the number of bits stored in [chunks].
The last [extra_bits] bits will be stored as the *least* significant bits
of the appropriate pair of bytes of the last chunk. *)num_bytes:int;extra_bits:int;initial_pos:pos}typet=t_Lazy.tletmemory_footprint_in_bytes(lazyt)=letnum_fields=4inletheader_words=1inletword_bytes=matchSys.word_sizewith|32->4|64->8|_->assertfalseinletchunk_words=letdiv_ceilab=(a+b-1)/binletn=div_ceil(Chunk.length+1(* NUL terminating bytes *)+1(* number of wasted bytes to fill a word *))word_bytesinn+header_wordsinletpos_fields=3inletpos_words=header_words+pos_fieldsinletlist_cons_words=header_words+2in(header_words+num_fields+pos_words+List.lengtht.chunks*(chunk_words+list_cons_words))*word_bytesmoduleBuilder=structtypet={mutablechunk:Chunk.t;mutablechunk_pos:int;mutablefilled_chunks:Chunk.tlist(* Filled chunks in reverse order *);mutableoffset:int(* Offset of the last saved position or newline plus
one, or [initial_pos] *);mutableint_buf:int(* the [num_bits] least significant bits of [int_buf]
are the bits not yet pushed to [chunk]. *);mutablenum_bits:int(* number of bits stored in [int_buf] *);mutableinitial_pos:pos}letinvariantt=assert(t.chunk_pos>=0&&t.chunk_pos<=Chunk.length);assert(t.offset>=t.initial_pos.offset);assert(t.num_bits<=15)letcheck_invariant=falseletinvariantt=ifcheck_invarianttheninvarianttletcreate?(initial_pos=beginning_of_file)()={chunk=Chunk.alloc();chunk_pos=0;filled_chunks=[];offset=initial_pos.offset;int_buf=0;num_bits=0;initial_pos};;letresett(pos:pos)=(* We need a new chunk as [contents] keeps the current chunk in the closure of the
lazy value. *)t.chunk<-Chunk.alloc();t.chunk_pos<-0;t.filled_chunks<-[];t.offset<-pos.offset;t.int_buf<-0;t.num_bits<-0;t.initial_pos<-pos;;;let[@inlinednever]alloc_new_chunkt=t.filled_chunks<-t.chunk::t.filled_chunks;t.chunk<-Chunk.alloc();t.chunk_pos<-0;;letadd_uint16tn=ift.chunk_pos=Chunk.lengththenalloc_new_chunkt;Chunk.set16t.chunk~pos:t.chunk_posn;;;letadd_bitstn~num_bits=letint_buf=(t.int_buflslnum_bits)lorninletnum_bits=t.num_bits+num_bitsint.int_buf<-int_buf;ifnum_bits<16thent.num_bits<-num_bitselsebeginletnum_bits=num_bits-16int.num_bits<-num_bits;add_uint16t(int_buflsrnum_bits);t.chunk_pos<-t.chunk_pos+2;(* no need to clear the bits of int_buf we just wrote, as further set16 will ignore
these extra bits. *)end;;letcontentst=(* Flush the current [t.int_buf] *)add_uint16tt.int_buf;letrev_chunks=t.chunk::t.filled_chunksinletchunk_pos=t.chunk_posinletextra_bits=t.num_bitsinletinitial_pos=t.initial_posinlazy{chunks=List.revrev_chunks;num_bytes=(List.lengthrev_chunks-1)*Chunk.length+chunk_pos;extra_bits;initial_pos};;letlong_shifttn=letn=ref(n-5)inwhile!n>0doadd_bitst((0b1100_0000lor(!nland0b0001_1111)))~num_bits:8;n:=!nlsr5;done;;(* precondition: n >= 5 *)let[@inlinednever]add_gen_slowtn~instr~instr_bits=long_shifttn;add_bitstinstr~num_bits:instr_bits;;letshift4=0b10_10_10_10let[@inlinealways]add_gent~offset~instr~instr_bits=invariantt;letn=offset-t.offsetint.offset<-offset+1;matchnwith|0|1|2|3|4->letnum_bits=(nlsl1)+instr_bitsinadd_bitst(((shift4lslinstr_bits)lorinstr)land(1lslnum_bits-1))~num_bits|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35|36->add_bitst(((0b1100_0000lor(n-5))lslinstr_bits)lorinstr)~num_bits:(8+instr_bits)|_->ifn<0theninvalid_arg"Parsexp.Positions.add_gen";add_gen_slowtn~instr~instr_bits;;letaddt~offset=add_gent~offset~instr:0b0~instr_bits:1letadd_twicet~offset=add_gent~offset~instr:0b1111~instr_bits:4letadd_newlinet~offset=add_gent~offset~instr:0b1110~instr_bits:4endtypepositions=tmoduleIterator:sigtypetvalcreate:positions->texceptionNo_more(* [advance t ~skip] ignores [skip] saved positions and returns the next saved position.
Raises [No_more] when reaching the end of the position set. *)valadvance_exn:t->skip:int->posend=structtypet={mutablechunk:Chunk.t;mutablechunks:Chunk.tlist;(* [num_bytes * 8 + extra_bits] is the number of bits available from [instr_pos] in
[chunk :: chunks]. *)mutablenum_bytes:int;extra_bits:int;mutableinstr_pos:int(* position in [chunk] *);mutableoffset:int;mutableline:int;mutablebol:int;mutableint_buf:int;mutablenum_bits:int(* Number of bits not yet consumed in [int_buf] *);mutablepending:posoption}letcreate(lazyp:positions)=matchp.chunkswith|[]->assertfalse|chunk::chunks->{chunk;chunks;num_bytes=p.num_bytes;extra_bits=p.extra_bits;instr_pos=0;offset=p.initial_pos.offset;line=p.initial_pos.line;bol=p.initial_pos.offset-p.initial_pos.col;int_buf=0;num_bits=0;pending=None};;exceptionNo_moreletno_more()=raise_notraceNo_morelet[@inlinednever]fetch_chunkt=matcht.chunkswith|[]->assertfalse|chunk::chunks->t.instr_pos<-0;t.num_bytes<-t.num_bytes-Chunk.length;t.chunk<-chunk;t.chunks<-chunks;;letfetcht=ift.instr_pos>t.num_bytesthenno_more();ift.instr_pos=Chunk.lengththenfetch_chunkt;letv=Chunk.get16t.chunk~pos:t.instr_posinletadded_bits=ift.instr_pos=t.num_bytesthent.extra_bitselse16int.int_buf<-(t.int_buflsladded_bits)lor(vland((1lsladded_bits)-1));t.num_bits<-t.num_bits+added_bits;t.instr_pos<-t.instr_pos+2;;letnext_instruction_bitst~num_bits=ift.num_bits<num_bitsthenbeginfetcht;ift.num_bits<num_bitsthenno_more()end;letn=(t.int_buflsr(t.num_bits-num_bits))land((1lslnum_bits)-1)int.num_bits<-t.num_bits-num_bits;n;;(* [offset_shift] and [offset_shift_num_bits] encode the offset number
specified by the immediately preceding [110] instructions. *)letrecadvancet~skip~offset_shift~offset_shift_num_bits=matchnext_instruction_bitst~num_bits:1with|0->(* bit seq 0 -> new item *)letoffset=t.offset+offset_shiftint.offset<-offset+1;ifskip=0then{line=t.line;col=offset-t.bol;offset=offset}elseadvancet~skip:(skip-1)~offset_shift:0~offset_shift_num_bits:0|_->matchnext_instruction_bitst~num_bits:1with|0->(* bit seq 10 -> shift *)t.offset<-t.offset+offset_shift+1;advancet~skip~offset_shift:0~offset_shift_num_bits:0|_->matchnext_instruction_bitst~num_bits:1with|0->(* bit seq 110 -> long shift *)letn=next_instruction_bitst~num_bits:5inletoffset_shift=ifoffset_shift_num_bits=0then5elseoffset_shiftinadvancet~skip~offset_shift:(offset_shift+(nlsloffset_shift_num_bits))~offset_shift_num_bits:(offset_shift_num_bits+5)|_->matchnext_instruction_bitst~num_bits:1with|0->(* bit seq 1110 -> newline *)t.offset<-t.offset+offset_shift+1;t.bol<-t.offset;t.line<-t.line+1;advancet~skip~offset_shift:0~offset_shift_num_bits:0|_->(* bit seq 1111 -> 2 new items *)letoffset=t.offset+offset_shiftint.offset<-offset+1;ifskip<=1thenbeginletpos={line=t.line;col=offset-t.bol;offset=offset}inifskip=0thent.pending<-Somepos;posendelseadvancet~skip:(skip-2)~offset_shift:0~offset_shift_num_bits:0letadvance_exnt~skip=matcht.pendingwith|Somepos->t.pending<-None;ifskip=0thenposelseadvancet~skip:(skip-1)~offset_shift:0~offset_shift_num_bits:0|None->advancet~skip~offset_shift:0~offset_shift_num_bits:0endletfindtab=ifa<0||b<=atheninvalid_arg"Parsexp.Positions.find";letiter=Iterator.createtintryletstart_pos=Iterator.advance_exniter~skip:ainletlast_pos=Iterator.advance_exniter~skip:(b-a-1)inmake_range_incl~start_pos~last_poswithIterator.No_more->failwith"Parsexp.Position.find";;letrecsub_sexp_count(sexp:Sexp.t)=matchsexpwith|Atom_->1|Listl->List.fold_leftl~init:1~f:(funaccx->acc+sub_sexp_countx);;moduleSexp_search=structexceptionFoundofintletrecloop~subindex(sexp:Sexp.t)=ifsexp==subthenraise_notrace(Foundindex)elsematchsexpwith|Atom_->index+2|Listl->letindex=loop_list~sub(index+1)linindex+1andloop_list~subindex(sexps:Sexp.tlist)=List.fold_leftsexps~init:index~f:(loop~sub)letfinalizet~suba=letb=a+sub_sexp_countsub*2-1inSome(findtab)letfind_sub_sexp_phystsexp~sub=matchloop~sub0sexpwith|(_:int)->None|exception(Foundn)->finalizet~subn;;letfind_sub_sexp_in_list_phystsexps~sub=matchloop_list~sub0sexpswith|(_:int)->None|exception(Foundn)->finalizet~subn;;endletfind_sub_sexp_phys=Sexp_search.find_sub_sexp_physletfind_sub_sexp_in_list_phys=Sexp_search.find_sub_sexp_in_list_physletto_listt=letiter=Iterator.createtinletrecloopacc=matchIterator.advance_exniter~skip:0with|exceptionIterator.No_more->List.revacc|pos->loop(pos::acc)inloop[];;letto_arrayt=to_listt|>Array.of_listletcomparet1t2=Caml.compare(to_arrayt1)(to_arrayt2)letsexp_of_tt=sexp_of_arraysexp_of_pos(to_arrayt)