123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200(* Join csv files on some common field *)openCoremoduleCsv=Csvlib.CsvmoduleT=structtypet=|Full|Inner|Left[@@derivingcompare,enumerate,sexp]endincludeTletparam=letjoin_switch="-join"inletkeys_need_not_occur_in_all_files_switch="-keys-need-not-occur-in-all-files"inlet%map_open.Commandt=Enum.make_paramjoin_switch(moduleT)~doc:"as in SQL (default: inner)"~f:optionalandkeys_need_not_occur_in_all_files=flagkeys_need_not_occur_in_all_files_switchno_arg~doc:" deprecated alias for -join full"inmatcht,keys_need_not_occur_in_all_fileswith|Somet,false->t|None,false->Inner|None,true->Full|Some_,true->raise_s[%message"cannot specify both"join_switchkeys_need_not_occur_in_all_files_switch];;moduleRow=structtypet=stringlist[@@derivingsexp]endmoduleKey:sigtypetincludeComparable.Swithtypet:=tincludeHashable.Swithtypet:=tvalcreate:stringarray->tvalto_list:t->stringlistend=structmoduleT=structopenPpx_hash_lib.Std.Hash.Builtintype'aarray_frozen='aarray[@@derivingcompare,sexp](* [array_frozen] can derive hash. We don't expose any mutation. *)typet=stringarray_frozen[@@derivingcompare,hash,sexp]endincludeTincludeComparable.Make(T)includeHashable.Make(T)letcreatet=tletto_list=Array.to_listendmoduleRows_by_key=struct(* The [header] and each [Row.t] in [data_by_key] are missing the key field. *)typet={data_by_key:Row.tlistKey.Map.t(* header for each row in [data_by_key]. *);header:Row.t}[@@derivingfields~getters,sexp]letload_rowsfile~sep=protectx(In_channel.createfile)~f:(funchannel->letrows=Csv.load_in~separator:sepchannelinmatchList.dedup_and_sort(List.maprows~f:(List.length:>_->_))~compare:[%compare:int]with|[]|[_]->rows|_->failwithf"rows in %s have different lengths"file())~finally:In_channel.close;;letload~file_name~key_fields~sep=matchload_rows~sepfile_namewith|[]->failwithf"file %s is empty"file_name()|header::rows->lethmap=matchString.Map.of_alist(List.mapiheader~f:(funih->h,i))with|`Okmap->map|`Duplicate_keyh->failwithf"repeated column %s in %s"hfile_name()inletkey_indices=Array.mapkey_fields~f:(funkey_field->matchMap.findhmapkey_fieldwith|Somei->i|None->failwithf"No %s column in %s"key_fieldfile_name())inletdata_indices=letkey_fields=String.Set.of_arraykey_fieldsinList.filter_mapiheader~f:(funih->ifSet.memkey_fieldshthenNoneelseSomei)inletdata_by_key=List.maprows~f:(funrow->letrow=Array.of_listrowin(Key.create@@Array.mapkey_indices~f:(Array.getrow),List.mapdata_indices~f:(Array.getrow)))|>Key.Map.of_alist_multiin{data_by_key;header=List.mapdata_indices~f:(Array.get(Array.of_listheader))};;endmoduleJoin_result:sigtypetvalempty_for_left_join:Rows_by_key.tlist->tvalempty_for_inner_join:Rows_by_key.tlist->tvalempty_for_full_join:Rows_by_key.tlist->t(* Any join can be expressed as a left join with the correct keys on the left side. *)valdo_left_join:t->Rows_by_key.t->tvalto_rows:t->Row.tSequence.tend=structtypet=(Key.t*Row.tSequence.t)Sequence.tletempty_of_keyskeys=Set.to_sequencekeys|>Sequence.map~f:(funkey->key,Sequence.singleton[]);;letempty_for_left_join=function|[]->failwith"join requires at least one csv."|car::_->empty_of_keys(Map.key_set(Rows_by_key.data_by_keycar:_Key.Map.t));;letreduce_keysmaps~f=Sequence.of_listmaps|>Sequence.map~f:Rows_by_key.data_by_key|>Sequence.map~f:Map.key_set|>Sequence.reduce_exn~f|>empty_of_keys;;letempty_for_inner_join=reduce_keys~f:Set.interletempty_for_full_join=reduce_keys~f:Set.unionletdo_left_jointrows=letempty_right_side_of_rows=Rows_by_key.headerrows|>List.map~f:(const"")|>Sequence.singletoninSequence.mapt~f:(fun(key,left_side_of_rows)->letright_side_of_rows:Row.tSequence.t=matchMap.find(Rows_by_key.data_by_keyrows)keywith|None->empty_right_side_of_rows|Someright_rows->Sequence.of_listright_rowsin(key,Sequence.concat_mapleft_side_of_rows~f:(funleft_side_of_row->Sequence.mapright_side_of_rows~f:(funright_side_of_row->left_side_of_row@right_side_of_row))));;letto_rowsjoin_result=Sequence.concat_mapjoin_result~f:(fun(key,rows)->Sequence.maprows~f:(funrow->List.append(Key.to_listkey)row));;endletjointfiles~key_fields~sep=letmaps=List.mapfiles~f:(funfile_name->Rows_by_key.load~file_name~key_fields~sep)inletcombined_header=List.append(Array.to_listkey_fields)(List.concat_mapmaps~f:Rows_by_key.header)inOption.iter(List.find_a_dupcombined_header~compare:[%compare:string])~f:(funduplicate->raise_s[%message"Only key fields may appear in multiple files."(duplicate:string)(combined_header:stringlist)]);letinit=matchtwith|Full->Join_result.empty_for_full_joinmaps|Inner->Join_result.empty_for_inner_joinmaps|Left->Join_result.empty_for_left_joinmapsinletrows=List.foldmaps~init~f:Join_result.do_left_join|>Join_result.to_rowsinSequence.shift_rightrowscombined_header;;