123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228openTreeexceptionSyntax_errorofstring(* |print| -- prints string representation of re *)letprintre=letrecstringify_ast=function|Literala->a|Epsilon->"ε"|Union(r1,r2)->"("^stringify_astr1^" + "^stringify_astr2^")"|Concat(r1,r2)->"("^stringify_astr1^" . "^stringify_astr2^")"|Starr1->stringify_astr1^"*"|Empty->"∅"inprint_string(stringify_astre);print_newline()(* |export_graphviz| -- exports the AST in the DOT language for Graphviz *)letexport_graphvizre=letcount=ref0inletrecgraphvizifyparent=function|Literala->incrcount;string_of_int!count^" [label=\""^a^"\", shape=ellipse, ];\n"^string_of_intparent^" -> "^string_of_int!count^"[label=\"\", ];\n"|Epsilon->incrcount;string_of_int!count^" [label=\"ε\", shape=ellipse, ];\n"^string_of_intparent^" -> "^string_of_int!count^"[label=\"\", ];\n"|Union(r1,r2)->incrcount;letc=!countingraphvizifycr1^graphvizifycr2^string_of_intc^" [label=\"Union\", shape=ellipse, ];\n"^string_of_intparent^" -> "^string_of_intc^"[label=\"\", ];\n"|Concat(r1,r2)->incrcount;letc=!countingraphvizifycr1^graphvizifycr2^string_of_intc^" [label=\"Concat\", shape=ellipse, ];\n"^string_of_intparent^" -> "^string_of_intc^"[label=\"\", ];\n"|Starr1->incrcount;letc=!countingraphvizifycr1^string_of_intc^" [label=\"Star\", shape=ellipse, ];\n"^string_of_intparent^" -> "^string_of_intc^"[label=\"\", ];\n"|Empty->incrcount;string_of_int!count^" [label=\"∅\", shape=ellipse, ];\n"^string_of_intparent^" -> "^string_of_int!count^"[label=\"\", ];\n"in"digraph G {\n0 [label=\"\", shape=none, height=0, width=0, ]\n"^graphvizify0re^"}"(* |is_subset_of| -- is L(r1) a subset of L(r2)? *)(* let _is_subset_of r1 r2 =
let n1 = Nfa.re_to_nfa r1 and
n2 = Nfa.re_to_nfa r2 in
let (n1', n2') = Nfa.merge_alphabets n1 n2 in
let d1 = Dfa.nfa_to_dfa n1' and
d2 = Dfa.nfa_to_dfa n2' in
let notd2 = Dfa.complement d2 in
Dfa.is_empty (Dfa.product_intersection d1 notd2) *)(* |get_alphabet| -- returns the alphabet of the RE *)letrecget_alphabet=function|Literala->[a]|Epsilon|Empty->[]|Union(r1,r2)|Concat(r1,r2)->Utils.list_union(get_alphabetr1)(get_alphabetr2)|Starr1->get_alphabetr1letis_literal=functionLiteral_|Epsilon|Empty->true|_->false(* Recurses through tree of unions to find any repeated a (or starred) *)letreccontainsare=ifre=athentrueelsematchrewith|Union(r1,r2)->containsar1||containsar2|Starr1->ifa=Epsilonthentrueelser1=a|_->falseletreccontainsNonLit=function|Union(r1,r2)->containsNonLitr1||containsNonLitr2|Epsilon|Empty|Concat(_,_)|Star_->true|_->false(* checks if re is just w^n (n>0) *)letrecrepeatedwre=ifre=wthentrueelsematchrewith|Concat(r1,r2)->repeatedwr1&&repeatedwr2|_->false(* |simplify_re| -- recursively simplifies the regex *)letrecsimplify_re=function(* Reduce by Kozen Axioms *)|Union(Union(r1,r2),r3)->simplify_re(Union(r1,Union(r2,r3)))(* (a + b) + c = a + (b + c) *)|Union(r1,Empty)->simplify_rer1(* a + ∅ = a *)|Union(Empty,r1)->simplify_rer1(* ∅ + a = a *)|Concat(Concat(r1,r2),r3)->simplify_re(Concat(r1,Concat(r2,r3)))(* (a.b).c = a.(b.c) *)|Concat(Epsilon,r1)->simplify_rer1(* ε.a = a *)|Concat(r1,Epsilon)->simplify_rer1(* a.ε = a *)|Union(Concat(r1,r2),Concat(r3,r4))whenr1=r3->simplify_re(Concat(r1,Union(r2,r4)))(* ab + ac = a(b+c) *)|Union(Concat(r1,r2),Concat(r3,r4))whenr2=r4->simplify_re(Concat(Union(r1,r3),r2))(* ac + bc = (a+b)c *)|Concat(Empty,_)->Empty(* ∅.a = ∅ *)|Concat(_,Empty)->Empty(* a.∅ = ∅ *)|Union(Epsilon,Concat(r1,Starr2))whenr1=r2->simplify_re(Starr1)(* ε + aa* = a* *)(* Order Unions lexicographically (for literals) *)|Union(a,Epsilon)whena<>Epsilon->simplify_re(Union(Epsilon,a))|Union(r1,Union(Epsilon,r2))whenr1<>Epsilon->simplify_re(Union(Epsilon,Union(r1,r2)))|Union(Literalr1,Union(Literalr2,r3))whenr2<r1->simplify_re(Union(Literalr2,Union(Literalr1,r3)))|Union(Literalr1,Literalr2)whenr2<r1->simplify_re(Union(Literalr2,Literalr1))|Union(r1,Union(Literalr2,r3))whennot(is_literalr1)->simplify_re(Union(Literalr2,Union(r1,r3)))|Union(r1,Literalr2)whennot(is_literalr1)->simplify_re(Union(Literalr2,r1))(* other reductions *)|Concat(Union(Epsilon,r1),Starr2)whenr1=r2->simplify_re(Starr1)(* (ε + a)a* = a* *)|Concat(Starr1,Union(Epsilon,r2))whenr1=r2->simplify_re(Starr1)(* a*(ε + a) = a* *)|Concat(r1,Concat(Union(Epsilon,r2),Starr3))whenr2=r3->simplify_re(Concat(r1,Starr2))(* a.((ε+b).b* ) = ab* *)|Star(Concat(Starr1,Starr2))->simplify_re(Star(Union(r1,r2)))(* ( a*b* )* = (a + b)* *)|Concat(Starr1,r2)whenr1=r2->simplify_re(Concat(r1,Starr1))(* a*a = aa* *)|Concat(Starr1,Concat(r2,r3))whenr1=r2->simplify_re(Concat(r1,Concat(Starr2,r3)))(* a*(ab) = a(a*b) *)|Star(Starr1)->simplify_re(Starr1)(* ( a* )* = a* *)|StarEmpty->Epsilon(* ∅* = ε *)|StarEpsilon->Epsilon(* ε* = ε *)|Union(r1,r2)whencontainsr1r2->simplify_rer2(* a + ... + (a + b) = ... + a + b OR a + ... + a* = ... + a* *)|Union(r1,r2)whencontainsr2r1->simplify_rer1|Union(r1,Starr2)whenrepeatedr2r1->simplify_re(Starr2)(* aa...a + a* = a* *)|Union(Starr1,r2)whenrepeatedr1r2->simplify_re(Starr1)(* a* + aa...a = a* *)|Concat(Starr1,Starr2)whencontainsr1r2->simplify_re(Starr2)(* a*b* = b* if a <= b *)|Concat(Starr1,Starr2)whencontainsr2r1->simplify_re(Starr1)(* a*b* = a* if b <= a *)|Concat(Starr1,Concat(Starr2,r3))whencontainsr1r2->simplify_re(Concat(Starr2,r3))(* a*(b*c) = b*c if a <= b *)|Concat(Starr1,Concat(Starr2,r3))whencontainsr2r1->simplify_re(Concat(Starr1,r3))(* a*(b*c) = a*c if b <= a *)|Starr1whenletalph=get_alphabetr1inList.lengthalph>0&&containsNonLitr1&&List.for_all(funa->contains(Literala)r1)alph->letalph=get_alphabetr1insimplify_re(Star(List.fold_right(funaacc->Union(Literala,acc))(List.tlalph)(Literal(List.hdalph))))(* REMOVED since they use DFA conversion for language checking...
(* More complex reductions, language based *)
| Union (r1, r2) when is_subset_of r1 r2 -> simplify_re r2 (* a + b = b if a <= b *)
| Union (r1, r2) when is_subset_of r2 r1 -> simplify_re r1 (* a + b = a if b <= a *)
| Concat (Star r1, Star r2) when is_subset_of r1 r2 -> simplify_re (Star r2) (* a*b* = b* if a <= b *)
| Concat (Star r1, Star r2) when is_subset_of r2 r1 -> simplify_re (Star r1) (* a*b* = a* if b <= a *)
| Concat (Concat (r1, Star r2), Star r3) when is_subset_of r2 r3 -> simplify_re (Concat (r1, Star r3)) (* ( ab* ) c* = ac* if b <= c *)
| Concat (Concat (r1, Star r2), Star r3) when is_subset_of r3 r2 -> simplify_re (Concat (r1, Star r2)) (* ( ab* ) c* = ab* if c <= b *)
| Star (Union (r1, r2)) when is_subset_of r2 (Star r1) -> simplify_re (Star r1) (* (a+b)* = a* if b <= a* *)
| Star (Union (r1, r2)) when is_subset_of r1 (Star r2) -> simplify_re (Star r2) (* (a+b)* = b* if a <= b* *)
*)(* otherwise, simplify children *)|Literala->Literala|Epsilon->Epsilon|Union(r1,r2)->Union(simplify_rer1,simplify_rer2)|Concat(r1,r2)->Concat(simplify_rer1,simplify_rer2)|Starr1->Star(simplify_rer1)|Empty->Empty(* |simplify| -- simplifies input regex. Repeats until no more changes *)letsimplifyre=letr=refreandnewr=ref(simplify_rere)inwhile!r<>!newrdor:=!newr;newr:=simplify_re!rdone;!r(* |is_nullable| -- returns true if RE contains ε *)letrecis_nullable=function|Epsilon|Star_->true|Literal_|Empty->false|Union(r1,r2)->is_nullabler1||is_nullabler2|Concat(r1,r2)->is_nullabler1&&is_nullabler2(* |derivative| -- returns the Brzozowski derivative w.r.t w *)letrecderivativerew=matchrewith|Literalawhenw=a->Epsilon|Literal_|Epsilon|Empty->Empty|Starr->Concat(derivativerw,Starr)|Union(r1,r2)->Union(derivativer1w,derivativer2w)|Concat(r1,r2)whenis_nullabler1->Union(Concat(derivativer1w,r2),derivativer2w)|Concat(r1,r2)->Concat(derivativer1w,r2)(* |parse| -- converts string into AST representation *)letparses=letlexbuf=Lexing.from_stringsintryParser.regexLexer.tokenlexbufwithParsing.Parse_error->lettok=Lexing.lexemelexbufinraise(Syntax_error("Syntax Error at token "^tok))