123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237(*
* Copyright (c) 2022 Tarides <contact@tarides.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)typeelt=stringtypet={elt_length:int;hash_elt:elt->int;hash_elt_substring:Bigstringaf.t->off:int->len:int->int;empty_slot:elt;mutabledata:Bigstringaf.t;mutabledata_length:int;mutableslot_count:int;mutablecardinal:int;}typehashset=tmoduleSlot:sigtypetvalof_elt:hashset->elt->tvalof_elt_substring:hashset->src:Bigstringaf.t->src_off:int->tvalcontains:hashset->t->elt->boolvalcontains_substring:hashset->t->src:Bigstringaf.t->src_off:int->boolvalis_empty:hashset->t->boolvalget:hashset->t->eltvalset:hashset->t->elt->unitvalset_substring:hashset->t->src:Bigstringaf.t->src_off:int->unitvalnext:hashset->t->tvaliter_all:hashset->f:(t->unit)->unitvalto_offset:t->intend=structtypet=Offsetofint[@@ocaml.unboxed]letoffset_of_hashhhash=letindex=abshashmodh.slot_countinOffset(index*h.elt_length)letof_elthelt=offset_of_hashh(h.hash_eltelt)letof_elt_substringh~src~src_off=offset_of_hashh(h.hash_elt_substringsrc~off:src_off~len:h.elt_length)letcontainsh(Offsetoffset)string=Bigstringaf.memcmp_stringh.dataoffsetstring0h.elt_length=0letcontains_substringh(Offsetoffset)~src~src_off=Bigstringaf.memcmph.dataoffsetsrcsrc_offh.elt_length=0letis_emptyht=containshth.empty_slotletgeth(Offsetoffset)=Bigstringaf.substringh.data~off:offset~len:h.elt_lengthletseth(Offsetoffset)elt=Bigstringaf.blit_from_stringelt~src_off:0h.data~dst_off:offset~len:h.elt_lengthletset_substringh(Offsetoffset)~src~src_off=Bigstringaf.blitsrc~src_offh.data~dst_off:offset~len:h.elt_lengthletnexth(Offsetoffset)=Offset((offset+h.elt_length)modh.data_length)letiter_allhashset~f=assert(hashset.data_length<>0);f(Offset0);letrecaux=function|Offset0->()|offset->foffset;aux(nexthashsetoffset)inaux(nexthashset(Offset0))letto_offset(Offsetn)=nendletempty_all_slotst=Slot.iter_allt~f:(funslot->Slot.settslott.empty_slot)moduleDefault=structlethash:string->int=Hashtbl.hashlethash_substringt~off~len=hash(Bigstringaf.substringt~off~len)letnull~elt_length=String.makeelt_length'\000'endletcreate~elt_length?(initial_slots=0)?hash?hash_substring?null()=ifelt_length<=0thenFmt.invalid_arg"%s.create: element length must be strictly positive"__MODULE__;letempty_slot=matchnullwithSomex->x|None->Default.null~elt_lengthinlethash_elt,hash_elt_substring=match(hash,hash_substring)with|Someh,Someh'->(h,h')|None,None->(Default.hash,Default.hash_substring)|Some_,None|None,Some_->Fmt.invalid_arg"%s.create: must pass either both [hash] and [hash_substring] or \
neither"__MODULE__inletslot_count=letrecauxn=ifn>=initial_slotsthennelseifn*2>Sys.max_array_lengththennelseaux(n*2)inaux2inletdata_length=slot_count*elt_lengthinletdata=Bigstringaf.createdata_lengthinlett={data;data_length;hash_elt;hash_elt_substring;elt_length;empty_slot;slot_count;cardinal=0;}inempty_all_slotst;tletload_factort=letslots_available=Bigstringaf.lengtht.data/t.elt_lengthinFloat.of_intt.cardinal/.Float.of_intslots_availabletypeok_or_duplicate=[`Ok|`Duplicate]letrecunguarded_addtslotelt:ok_or_duplicate=ifSlot.is_emptytslotthen((* Write the element to this slot *)Slot.settslotelt;`Ok)elseifSlot.containstsloteltthen`Duplicateelseunguarded_addt(Slot.nexttslot)eltletrecunguarded_add_substringtslot~src~src_off:ok_or_duplicate=ifSlot.is_emptytslotthen((* Write the element to this slot *)Slot.set_substringtslot~src~src_off;`Ok)elseifSlot.contains_substringtslot~src~src_offthen`Duplicateelseunguarded_add_substringt(Slot.nexttslot)~src~src_offletresizet=letold_len=Bigstringaf.lengtht.datainletold_data=t.datainletnew_len=old_len+(t.slot_count/2*t.elt_length)inletnew_data=Bigstringaf.createnew_leninletold_t={twithdata=old_data;data_length=old_len}int.data<-new_data;t.data_length<-new_len;t.slot_count<-new_len/t.elt_length;empty_all_slotst;Slot.iter_allold_t~f:(funold_slot->ifnot(Slot.is_emptyold_told_slot)thenletsrc_off=Slot.to_offsetold_slotinletnew_slot=Slot.of_elt_substringt~src:old_t.data~src_offinletresult=unguarded_add_substringtnew_slot~src:old_t.data~src_offinassert(result=`Ok))(* Resize when the hashset is more than 90% full: *)letmax_load_factor=0.9letaddtelt=ifString.lengthelt<>t.elt_lengththenFmt.invalid_arg"%s.add: cannot write string of incorrect size to hashset"__MODULE__;ifString.equaleltt.empty_slotthenFmt.invalid_arg"%s.add: cannot write null value to hashset"__MODULE__;ifFloat.compare(load_factort)max_load_factor>=0thenresizet;letslot=Slot.of_eltteltinletresult=unguarded_addtsloteltinifresult=`Okthent.cardinal<-t.cardinal+1;resultletadd_exntelt=matchaddteltwith|`Ok->()|`Duplicate->Fmt.invalid_arg"%s.add_exn: element '%S' already present"__MODULE__eltletmemtelt=ifString.lengthelt<>t.elt_lengththenFmt.invalid_arg"%s.mem: cannot read string of incorrect size from hashset"__MODULE__;ifString.equaleltt.empty_slotthenFmt.failwith"%s.mem: cannot read null value from hashset"__MODULE__;letrecprobe_loopslot=ifSlot.containstsloteltthentrueelseifSlot.is_emptytslotthenfalseelseprobe_loop(Slot.nexttslot)inprobe_loop(Slot.of_elttelt)letinvariantinvariant_eltt=letelement_count=ref0inSlot.iter_allt~f:(funslot->ifnot(Slot.is_emptytslot)then(increlement_count;invariant_elt(Slot.gettslot)));assert(t.cardinal=!element_count)(* Using [Obj.reachable_words] directly on values of type [t] will give
inaccurate results since bigstrings are allocated on the C heap. As a
workaround, we provide a dedicated [reachable_words] function for use in
benchmarking this implementation. *)letreachable_wordst=letbytes_per_word=Sys.word_size/8in(t.data_length/bytes_per_word)+Obj.reachable_words(Obj.reprt)