123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869(** [distance s t] computes the Levenshtein edit distance between two UTF-8
encoded strings. This function correctly handles multibyte characters,
including Japanese, Chinese, emoji, and other Unicode symbols.
The edit distance is the minimum number of single-character edits
(insertions, deletions, or substitutions) required to change one string into
the other.
Example:
- distance "kitten" "sitting" = 3
- distance "こんにちは" "こんばんは" = 2
- distance "🍎" "🍏" = 1 *)letdecode_utf8_array(s:string):Uchar.tarray=(* Decodes a UTF-8 encoded string [s] into an array of Unicode code points
(Uchar.t). OCaml's [String.length] returns byte length, not character
count, so we use the [Uutf] library to properly decode multibyte
characters. *)letrecauxaccdecoder=matchUutf.decodedecoderwith|`Ucharu->aux(u::acc)decoder(* Append valid Unicode character to accumulator *)|`End->Array.of_list(List.revacc)(* Reverse and convert list to array at end *)|`Malformed_->aux(Uchar.of_char'?'::acc)decoder(* Replace malformed bytes with '?' *)|`Await->assertfalse(* `Await` won't occur when decoding from a string *)inaux[](Uutf.decoder(`Strings))letdistance(s:string)(t:string):int=(* Convert UTF-8 strings to arrays of Unicode code points *)lets_chars=decode_utf8_arraysinlett_chars=decode_utf8_arraytinletm=Array.lengths_charsinletn=Array.lengtht_charsin(* Initialize DP (dynamic programming) matrix: dp.(i).(j) represents the edit
distance between the first [i] characters of [s] and the first [j]
characters of [t] *)letdp=Array.make_matrix(m+1)(n+1)0in(* Base cases: transforming to/from an empty string *)fori=0tomdodp.(i).(0)<-idone;forj=0tondodp.(0).(j)<-jdone;(* Main DP loop to compute minimal edit distance *)fori=1tomdoforj=1tondoletcost=ifUchar.equals_chars.(i-1)t_chars.(j-1)then0else1indp.(i).(j)<-min(min(dp.(i-1).(j)+1)(* Deletion *)(dp.(i).(j-1)+1))(* Insertion *)(dp.(i-1).(j-1)+cost)(* Substitution *)donedone;(* The final cell contains the edit distance between the full strings *)dp.(m).(n)