123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356(* This files comes from camlp5 (ocaml_src/lib/diff.ml). *)(*
* Copyright (c) 2007-2013, INRIA (Institut National de Recherches en
* Informatique et Automatique). All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of INRIA, nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY INRIA AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INRIA AND
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*)(* $Id: diff.ml,v 1.2 2013-02-26 08:15:06 deraugla Exp $ *)(* Parts of Code of GNU diff (diffseq.h and analyze.c) translated to OCaml and adjusted.
Basic algorithm described by Eugene W.Myers in: "An O(ND) Difference Algorithm and Its
Variations" *)openBase(* A partition is the midpoint of the shortest edit script for a specified portion of two
vectors.
[xmid, ymid] is the midpoint discovered. The diagonal number [xmid - ymid] equals the
number of inserted elements minus the number of deleted elements (counting only
elements before the midpoint).
[lo_minimal] is true iff the minimal edit script for the left half of the partition is
known; similarly for [hi_minimal].
*)modulePartition=structtypet={xmid:int;ymid:int;lo_minimal:bool;hi_minimal:bool}end(* We keep this file in a C-like style so that we can easily compare against the original
C, in which we have great confidence. *)(* Find the midpoint of the shortest edit script for a specified portion of the two
vectors.
Scan from the beginnings of the vectors, and simultaneously from the ends, doing a
breadth-first search through the space of edit-sequence. When the two searches meet, we
have found the midpoint of the shortest edit sequence.
If [find_minimal] is true, find the minimal edit script regardless of expense.
Otherwise, if the search is too expensive, use heuristics to stop the search and report
a suboptimal answer.
This function assumes that the first elements of the specified portions of the two
vectors do not match, and likewise that the last elements do not match. The caller must
trim matching elements from the beginning and end of the portions it is going to
specify.
If we return the "wrong" partitions, the worst this can do is cause suboptimal diff
output. It cannot cause incorrect diff output. *)letdiag~fd~bd~sh~xv~yv~xoff~xlim~yoff~ylim~too_expensive~find_minimal:Partition.t=letdmin=xoff-ylim(* minimum valid diagonal *)inletdmax=xlim-yoff(* maximum valid diagonal *)inletfmid=xoff-yoff(* center diagonal of forward search *)inletbmid=xlim-ylim(* center diagonal of backward search *)in(* southeast corner is on an odd diagonal w.r.t the northwest *)letodd=(fmid-bmid)land1<>0in(* [sh] is an offset that lets us use indices in [[-(m+1), n+1]]. *)fd.(sh+fmid)<-xoff;bd.(sh+bmid)<-xlim;With_return.with_return(fun({return}:Partition.tWith_return.return)->(* [c] is cost.
[fmin], [fmax] are limits of the forward search.
[bmin], [bmax] are limits of the backward search. *)letrecloop~c~fmin~fmax~bmin~bmax=(* Extend the forward search by one edit step in each diagonal. *)letfmin=iffmin>dminthen(fd.(sh+fmin-2)<--1;fmin-1)elsefmin+1inletfmax=iffmax<dmaxthen(fd.(sh+fmax+2)<--1;fmax+1)elsefmax-1in(* [d] is the active diagonal. *)(letrecloopd=ifd<fminthen()else(lettlo=fd.(sh+d-1)inletthi=fd.(sh+d+1)inletx=iftlo>=thithentlo+1elsethiinletx,y=letrecloop~xv~yv~xlim~ylim~x~y=ifx<xlim&&y<ylim&&phys_equal(xvx)(yvy)thenloop~xv~yv~xlim~ylim~x:(x+1)~y:(y+1)elsex,yinloop~xv~yv~xlim~ylim~x~y:(x-d)infd.(sh+d)<-x;ifodd&&bmin<=d&&d<=bmax&&bd.(sh+d)<=fd.(sh+d)thenreturn{xmid=x;ymid=y;lo_minimal=true;hi_minimal=true}elseloop(d-2))inloopfmax);(* Similarly extend the backward search. *)letbmin=ifbmin>dminthen(bd.(sh+bmin-2)<-Int.max_value;bmin-1)elsebmin+1inletbmax=ifbmax<dmaxthen(bd.(sh+bmax+2)<-Int.max_value;bmax+1)elsebmax-1in(letrecloopd=ifd<bminthen()else(lettlo=bd.(sh+d-1)inletthi=bd.(sh+d+1)inletx=iftlo<thithentloelsethi-1inletx,y=letrecloop~xv~yv~xoff~yoff~x~y=ifx>xoff&&y>yoff&&phys_equal(xv(x-1))(yv(y-1))thenloop~xv~yv~xoff~yoff~x:(x-1)~y:(y-1)elsex,yinloop~xv~yv~xoff~yoff~x~y:(x-d)inbd.(sh+d)<-x;ifnotodd&&fmin<=d&&d<=fmax&&bd.(sh+d)<=fd.(sh+d)thenreturn{xmid=x;ymid=y;lo_minimal=true;hi_minimal=true}elseloop(d-2))inloopbmax);(* Heuristic: if we've gone well beyond the call of duty, give up and report halfway
between our best results so far. *)ifnotfind_minimal&&c>=too_expensivethen((* Find forward diagonal that maximizes [x + y]. *)letfxybest,fxbest=letrecloop~d~fxybest~fxbest=ifd<fminthenfxybest,fxbestelse(letx=Int.minfd.(sh+d)xliminlety=x-dinletx,y=ifylim<ythenylim+d,ylimelsex,yinletfxybest,fxbest=iffxybest<x+ythenx+y,xelsefxybest,fxbestinloop~d:(d-2)~fxybest~fxbest)inloop~d:fmax~fxybest:(-1)~fxbest:fmaxin(* Find backward diagonal that minimizes [x + y]. *)letbxybest,bxbest=letrecloop~d~bxybest~bxbest=ifd<bminthenbxybest,bxbestelse(letx=Int.maxxoffbd.(sh+d)inlety=x-dinletx,y=ify<yoffthenyoff+d,yoffelsex,yinletbxybest,bxbest=ifx+y<bxybestthenx+y,xelsebxybest,bxbestinloop~d:(d-2)~bxybest~bxbest)inloop~d:bmax~bxybest:Int.max_value~bxbest:bmaxinifxlim+ylim-bxybest<fxybest-(xoff+yoff)thenreturn{xmid=fxbest;ymid=fxybest-fxbest;lo_minimal=true;hi_minimal=false}elsereturn{xmid=bxbest;ymid=bxybest-bxbest;lo_minimal=false;hi_minimal=true})elseloop~c:(c+1)~fmin~fmax~bmin~bmaxinloop~c:1~fmin:fmid~fmax:fmid~bmin:bmid~bmax:bmid);;letdiff_loop~cutoffaaibbinm=letfd=Array.create~len:(n+m+3)0inletbd=Array.create~len:(n+m+3)0inletsh=m+1inlettoo_expensive=matchcutoffwith|Somec->c|None->letdiags=n+m+3inletrecloopdiagstoo_expensive=ifdiags=0thentoo_expensiveelseloop(diagsasr2)(too_expensivelsl1)inInt.max4096(loopdiags1)inletxveci=a.(ai.(i))inletyvecj=b.(bi.(j))inletchng1=Array.create~len:(Array.lengtha)trueinletchng2=Array.create~len:(Array.lengthb)trueinfori=0ton-1dochng1.(ai.(i))<-falsedone;forj=0tom-1dochng2.(bi.(j))<-falsedone;letrecloop~xoff~xlim~yoff~ylim~find_minimal=letxoff,yoff=letrecloop~xoff~yoff=ifxoff<xlim&&yoff<ylim&&phys_equal(xvecxoff)(yvecyoff)thenloop~xoff:(xoff+1)~yoff:(yoff+1)elsexoff,yoffinloop~xoff~yoffinletxlim,ylim=letrecloop~xlim~ylim=ifxlim>xoff&&ylim>yoff&&phys_equal(xvec(xlim-1))(yvec(ylim-1))thenloop~xlim:(xlim-1)~ylim:(ylim-1)elsexlim,yliminloop~xlim~yliminifxoff=xlimthenfory=yofftoylim-1dochng2.(bi.(y))<-truedoneelseifyoff=ylimthenforx=xofftoxlim-1dochng1.(ai.(x))<-truedoneelse(let{Partition.xmid;ymid;lo_minimal;hi_minimal}=diag~fd~bd~sh~xv:xvec~yv:yvec~xoff~xlim~yoff~ylim~too_expensive~find_minimalinloop~xoff~xlim:xmid~yoff~ylim:ymid~find_minimal:lo_minimal;loop~xoff:xmid~xlim~yoff:ymid~ylim~find_minimal:hi_minimal)inloop~xoff:0~xlim:n~yoff:0~ylim:m~find_minimal:false;chng1,chng2;;(* [make_indexer a b] returns an array of the indices of items of [a] which are also
present in [b]; this way, the main algorithm can skip items which, anyway, are
different. This improves the speed much. At the same time, this function updates the
items of [a] and [b] so that all equal items point to the same unique item. All item
comparisons in the main algorithm can therefore be done with [phys_equal] instead of
[=], which can improve speed much. *)letmake_indexerhashableab=letn=Array.lengthainlethtb=Hashtbl.createhashable~size:(10*Array.lengthb)inArray.iteri~f:(funie->matchHashtbl.findhtbewith|Somev->b.(i)<-v|None->Hashtbl.add_exnhtb~key:e~data:e)b;letai=Array.create~len:n0inletk=letrecloopik=ifi=nthenkelse(letk=matchHashtbl.findhtba.(i)with|Somev->a.(i)<-v;ai.(k)<-i;k+1|None->kinloop(i+1)k)inloop00inArray.subai~pos:0~len:k;;letf~cutoff~hashableab=letai=make_indexerhashableabinletbi=make_indexerhashablebainletn=Array.lengthaiinletm=Array.lengthbiindiff_loop~cutoffaaibbinm;;letiter_matches?cutoff~f:ff~hashableab=letd1,d2=f~cutoff~hashableabinletrecauxi1i2=ifi1>=Array.lengthd1||i2>=Array.lengthd2then()elseifnotd1.(i1)thenifnotd2.(i2)then(ff(i1,i2);aux(i1+1)(i2+1))elseauxi1(i2+1)elseifnotd2.(i2)thenaux(i1+1)i2elseaux(i1+1)(i2+1)inaux00;;