(* The package sedlex is released under the terms of an MIT-like license. *)
(* See the attached LICENSE file.                                         *)
(* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)

(* Character sets are represented as lists of intervals.  The
   intervals must be non-overlapping and not collapsible, and the list
   must be ordered in increasing order. *)

type t = (int * int) list

(* Largest code point that a set may contain. *)
let max_code = 0x10ffff (* must be < max_int *)

(* Smallest code that a set may contain. *)
let min_code = -1

(* The set containing nothing. *)
let empty = []

(* [singleton i] is the set containing only [i]. *)
let singleton i = [ (i, i) ]

(* [is_empty c] is [true] iff [c] has no interval. *)
let is_empty = function [] -> true | _ -> false

(* [interval i j] is the set of all codes between [i] and [j] inclusive;
   the bounds may be given in either order. *)
let interval i j = if i <= j then [ (i, j) ] else [ (j, i) ]

(* The set containing only the code -1. *)
let eof = singleton (-1)

(* Every code from 0 to [max_code]; the code -1 is not included. *)
let any = interval 0 max_code

(* [union c1 c2] merges two sets.  Both arguments must satisfy the
   representation invariant; intervals that overlap or touch are fused,
   so the result satisfies it as well. *)
let rec union c1 c2 =
  match (c1, c2) with
    | [], _ -> c2
    | _, [] -> c1
    | ((i1, j1) as s1) :: r1, (i2, j2) :: r2 ->
        if i1 <= i2 then
          if j1 + 1 < i2 then
            (* [s1] ends strictly before [i2 - 1]: nothing can touch it. *)
            s1 :: union r1 c2
          else if j1 < j2 then
            (* Overlapping or adjacent: extend the head of [c2] leftwards. *)
            union r1 ((i1, j2) :: r2)
          else
            (* The head of [c2] is entirely covered by [s1]. *)
            union c1 r2
        else
          (* Keep the list with the smaller lower bound on the left. *)
          union c2 c1

(* [complement c] is the set of codes in [-1, max_code] that are not in [c].
   [c] must satisfy the representation invariant: two adjacent intervals
   in [c] would produce an inverted (empty) interval in the result. *)
let complement c =
  let rec aux start = function
    | [] -> if start <= max_code then [ (start, max_code) ] else []
    | (i, j) :: l -> (start, i - 1) :: aux (succ j) l
  in
  match c with
    | (-1, j) :: l ->
        (* [c] already starts at the lowest code: no leading gap to emit. *)
        aux (succ j) l
    | l -> aux (-1) l

(* [intersection c1 c2] is the set of codes present in both arguments,
   computed as the complement of the union of the complements. *)
let intersection c1 c2 = complement (union (complement c1) (complement c2))

(* [difference c1 c2] is the set of codes of [c1] that are not in [c2]. *)
let difference c1 c2 = complement (union (complement c1) c2)

(* [normalize l] builds a set satisfying the representation invariant from
   an arbitrary list of intervals (unsorted, overlapping or adjacent).
   The literal tables below are transcribed range by range from their
   sources and contain adjacent ranges, so they go through this function
   before being exposed. *)
let normalize (l : (int * int) list) : t =
  List.fold_left (fun acc (i, j) -> union acc (interval i j)) empty l

(* Unicode classes from XML *)

let base_char =
  normalize
    [ 0x0041,0x005A; 0x0061,0x007A; 0x00C0,0x00D6; 0x00D8,0x00F6;
      0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148;
      0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5;
      0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386;
      0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE;
      0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE;
      0x03E0,0x03E0; 0x03E2,0x03F3; 0x0401,0x040C; 0x040E,0x044F;
      0x0451,0x045C; 0x045E,0x0481; 0x0490,0x04C4; 0x04C7,0x04C8;
      0x04CB,0x04CC; 0x04D0,0x04EB; 0x04EE,0x04F5; 0x04F8,0x04F9;
      0x0531,0x0556; 0x0559,0x0559; 0x0561,0x0586; 0x05D0,0x05EA;
      0x05F0,0x05F2; 0x0621,0x063A; 0x0641,0x064A; 0x0671,0x06B7;
      0x06BA,0x06BE; 0x06C0,0x06CE; 0x06D0,0x06D3; 0x06D5,0x06D5;
      0x06E5,0x06E6; 0x0905,0x0939; 0x093D,0x093D; 0x0958,0x0961;
      0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8; 0x09AA,0x09B0;
      0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD; 0x09DF,0x09E1;
      0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10; 0x0A13,0x0A28;
      0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36; 0x0A38,0x0A39;
      0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74; 0x0A85,0x0A8B;
      0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8; 0x0AAA,0x0AB0;
      0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD; 0x0AE0,0x0AE0;
      0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30;
      0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D;
      0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95;
      0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4;
      0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C;
      0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39;
      0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8;
      0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1;
      0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39;
      0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33;
      0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88;
      0x0E8A,0x0E8A; 0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F;
      0x0EA1,0x0EA3; 0x0EA5,0x0EA5; 0x0EA7,0x0EA7; 0x0EAA,0x0EAB;
      0x0EAD,0x0EAE; 0x0EB0,0x0EB0; 0x0EB2,0x0EB3; 0x0EBD,0x0EBD;
      0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69; 0x10A0,0x10C5;
      0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103; 0x1105,0x1107;
      0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112; 0x113C,0x113C;
      0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E;
      0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159; 0x115F,0x1161;
      0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167; 0x1169,0x1169;
      0x116D,0x116E; 0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E;
      0x11A8,0x11A8; 0x11AB,0x11AB; 0x11AE,0x11AF; 0x11B7,0x11B8;
      0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB; 0x11F0,0x11F0;
      0x11F9,0x11F9; 0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15;
      0x1F18,0x1F1D; 0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57;
      0x1F59,0x1F59; 0x1F5B,0x1F5B; 0x1F5D,0x1F5D; 0x1F5F,0x1F7D;
      0x1F80,0x1FB4; 0x1FB6,0x1FBC; 0x1FBE,0x1FBE; 0x1FC2,0x1FC4;
      0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB; 0x1FE0,0x1FEC;
      0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126; 0x212A,0x212B;
      0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094; 0x30A1,0x30FA;
      0x3105,0x312C; 0xAC00,0xD7A3 ]

let ideographic =
  normalize [ 0x3007,0x3007; 0x3021,0x3029; 0x4E00,0x9FA5 ]

let combining_char =
  normalize
    [ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1;
      0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2;
      0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC;
      0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED;
      0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D;
      0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC;
      0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8;
      0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02;
      0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42;
      0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83;
      0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD;
      0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48;
      0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2;
      0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03;
      0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56;
      0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD;
      0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48;
      0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A;
      0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC;
      0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37;
      0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84;
      0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD;
      0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1;
      0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ]

let digit =
  normalize
    [ 0x0030,0x0039; 0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F;
      0x09E6,0x09EF; 0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F;
      0x0BE7,0x0BEF; 0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F;
      0x0E50,0x0E59; 0x0ED0,0x0ED9; 0x0F20,0x0F29 ]

let extender =
  normalize
    [ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640;
      0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035;
      0x309D,0x309E; 0x30FC,0x30FE ]

let blank = normalize [ 0x0009,0x000A; 0x000D,0x000D; 0x0020,0x0020 ]

let letter = union base_char ideographic

(* Letters to be used in identifiers, as specified
   by ISO ....
   Data provided by John M. Skaller *)
let tr8876_ident_char =
  normalize
    [ (* ASCII *)
      (0x0041,0x005a); (0x0061,0x007a);
      (* Latin *)
      (0x00c0,0x00d6); (0x00d8,0x00f6); (0x00f8,0x01f5); (0x01fa,0x0217);
      (0x0250,0x02a8);
      (* Greek *)
      (0x0384,0x0384); (0x0388,0x038a); (0x038c,0x038c); (0x038e,0x03a1);
      (0x03a3,0x03ce); (0x03d0,0x03d6); (0x03da,0x03da); (0x03dc,0x03dc);
      (0x03de,0x03de); (0x03e0,0x03e0); (0x03e2,0x03f3);
      (* Cyrillic *)
      (0x0401,0x040d); (0x040f,0x044f); (0x0451,0x045c); (0x045e,0x0481);
      (0x0490,0x04c4); (0x04c7,0x04c8); (0x04cb,0x04cc); (0x04d0,0x04eb);
      (0x04ee,0x04f5); (0x04f8,0x04f9);
      (* Armenian *)
      (0x0531,0x0556); (0x0561,0x0587);
      (* Hebrew *)
      (0x05d0,0x05ea); (0x05f0,0x05f4);
      (* Arabic *)
      (0x0621,0x063a); (0x0640,0x0652); (0x0670,0x06b7); (0x06ba,0x06be);
      (0x06c0,0x06ce); (0x06e5,0x06e7);
      (* Devanagari *)
      (0x0905,0x0939); (0x0958,0x0962);
      (* Bengali *)
      (0x0985,0x098c); (0x098f,0x0990); (0x0993,0x09a8); (0x09aa,0x09b0);
      (0x09b2,0x09b2); (0x09b6,0x09b9); (0x09dc,0x09dd); (0x09df,0x09e1);
      (0x09f0,0x09f1);
      (* Gurmukhi *)
      (0x0a05,0x0a0a); (0x0a0f,0x0a10); (0x0a13,0x0a28); (0x0a2a,0x0a30);
      (0x0a32,0x0a33); (0x0a35,0x0a36); (0x0a38,0x0a39); (0x0a59,0x0a5c);
      (0x0a5e,0x0a5e);
      (* Gujarati *)
      (0x0a85,0x0a8b); (0x0a8d,0x0a8d); (0x0a8f,0x0a91); (0x0a93,0x0aa8);
      (0x0aaa,0x0ab0); (0x0ab2,0x0ab3); (0x0ab5,0x0ab9); (0x0ae0,0x0ae0);
      (* Oriya *)
      (0x0b05,0x0b0c); (0x0b0f,0x0b10); (0x0b13,0x0b28); (0x0b2a,0x0b30);
      (0x0b32,0x0b33); (0x0b36,0x0b39); (0x0b5c,0x0b5d); (0x0b5f,0x0b61);
      (* Tamil *)
      (0x0b85,0x0b8a); (0x0b8e,0x0b90); (0x0b92,0x0b95); (0x0b99,0x0b9a);
      (0x0b9c,0x0b9c); (0x0b9e,0x0b9f); (0x0ba3,0x0ba4); (0x0ba8,0x0baa);
      (0x0bae,0x0bb5); (0x0bb7,0x0bb9);
      (* Telugu *)
      (0x0c05,0x0c0c); (0x0c0e,0x0c10); (0x0c12,0x0c28); (0x0c2a,0x0c33);
      (0x0c35,0x0c39); (0x0c60,0x0c61);
      (* Kannada *)
      (0x0c85,0x0c8c); (0x0c8e,0x0c90); (0x0c92,0x0ca8); (0x0caa,0x0cb3);
      (0x0cb5,0x0cb9); (0x0ce0,0x0ce1);
      (* Malayalam *)
      (0x0d05,0x0d0c); (0x0d0e,0x0d10); (0x0d12,0x0d28); (0x0d2a,0x0d39);
      (0x0d60,0x0d61);
      (* Thai *)
      (0x0e01,0x0e30); (0x0e32,0x0e33); (0x0e40,0x0e46); (0x0e4f,0x0e5b);
      (* Lao *)
      (0x0e81,0x0e82); (0x0e84,0x0e84); (0x0e87,0x0e88); (0x0e8a,0x0e8a);
      (0x0e8d,0x0e8d); (0x0e94,0x0e97); (0x0e99,0x0e9f); (0x0ea1,0x0ea3);
      (0x0ea5,0x0ea5); (0x0ea7,0x0ea7); (0x0eaa,0x0eab); (0x0ead,0x0eb0);
      (0x0eb2,0x0eb3); (0x0ebd,0x0ebd); (0x0ec0,0x0ec4); (0x0ec6,0x0ec6);
      (* Georgian *)
      (0x10a0,0x10c5); (0x10d0,0x10f6);
      (* Hangul Jamo *)
      (0x1100,0x1159); (0x1161,0x11a2); (0x11a8,0x11f9);
      (* Latin extensions *)
      (0x1e00,0x1e9a); (0x1ea0,0x1ef9);
      (* Greek extended *)
      (0x1f00,0x1f15); (0x1f18,0x1f1d); (0x1f20,0x1f45); (0x1f48,0x1f4d);
      (0x1f50,0x1f57); (0x1f59,0x1f59); (0x1f5b,0x1f5b); (0x1f5d,0x1f5d);
      (0x1f5f,0x1f7d); (0x1f80,0x1fb4); (0x1fb6,0x1fbc); (0x1fc2,0x1fc4);
      (0x1fc6,0x1fcc); (0x1fd0,0x1fd3); (0x1fd6,0x1fdb); (0x1fe0,0x1fec);
      (0x1ff2,0x1ff4); (0x1ff6,0x1ffc);
      (* Hiragana *)
      (0x3041,0x3094); (0x309b,0x309e);
      (* Katakana *)
      (0x30a1,0x30fe);
      (* Bopomofo *)
      (0x3105,0x312c);
      (* CJK Unified Ideographs *)
      (0x4e00,0x9fa5);
      (* CJK Compatibility Ideographs *)
      (0xf900,0xfa2d);
      (* Arabic Presentation Forms *)
      (0xfb1f,0xfb36); (0xfb38,0xfb3c); (0xfb3e,0xfb3e); (0xfb40,0xfb41);
      (0xfb42,0xfb44); (0xfb46,0xfbb1); (0xfbd3,0xfd35);
      (* Arabic Presentation Forms-A *)
      (0xfd50,0xfd85); (0xfd92,0xfdc7); (0xfdf0,0xfdfb);
      (* Arabic Presentation Forms-B *)
      (0xfe70,0xfe72); (0xfe74,0xfe74); (0xfe76,0xfefc);
      (* Half width and Fullwidth Forms *)
      (0xff21,0xff3a); (0xff41,0xff5a); (0xff66,0xffbe); (0xffc2,0xffc7);
      (0xffca,0xffcf); (0xffd2,0xffd7); (0xffda,0xffdc) ]