(************************************************************************)
(* Flèche => document manager: Language Support                         *)
(* Copyright 2019-2024 Inria -- Dual License LGPL 2.1 / GPL3+           *)
(* Written by: Emilio J. Gallego Arias                                  *)
(************************************************************************)

(* LICENSE NOTE: this file includes code from Camomile and OCaml stdlib (for
   compatibility). This is just out of convenience, the included functions are
   quite trivial, and eventually we should be able to use OCaml's stdlib and
   remove most of this code. *)

(* Camomile Copyright: *)
(* Copyright (C) 2002, 2003 Yamagata Yoriyuki. *)

(* This library is free software; you can redistribute it and/or *)
(* modify it under the terms of the GNU Lesser General Public License *)
(* as published by the Free Software Foundation; either version 2 of *)
(* the License, or (at your option) any later version. *)

(* As a special exception to the GNU Library General Public License, you *)
(* may link, statically or dynamically, a "work that uses this library" *)
(* with a publicly distributed version of this library to produce an *)
(* executable file containing portions of this library, and distribute *)
(* that executable file under terms of your choice, without any of the *)
(* additional requirements listed in clause 6 of the GNU Library General *)
(* Public License. By "a publicly distributed version of this library", *)
(* we mean either the unmodified Library as distributed by the authors, *)
(* or a modified version of this library that is distributed under the *)
(* conditions defined in clause 3 of the GNU Library General Public *)
(* License. This exception does not however invalidate any other reasons *)
(* why the executable file might be covered by the GNU Library General *)
(* Public License . *)

(* This library is distributed in the hope that it will be useful, *)
(* but WITHOUT ANY WARRANTY; without even the implied warranty of *)
(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *)
(* Lesser General Public License for more details. *)

(* You should have received a copy of the GNU Lesser General Public *)
(* License along with this library; if not, write to the Free Software *)
(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)
(* USA *)

(** This module provides facilities for translating language-based locations to
    protocol-based locations.

    After a long discussion (thanks Léo !), we have decided that the best is to
    have `Lang.Point` to store columns offset in the values that are native to
    the protocol under consideration, set by the upper layers.

    This scheme kind of follows what we have done since the start with coq-lsp. *)

module Encoding = struct
  (* Used for char offsets *)
  type t =
    | Utf8
    | Utf16
    | Utf32
end

(* Future work: support multiple encodings *)
(* val set_protocol_encoding : *)

(* EJGA: Taken from Camomile, but note what I wrote below *)

(* utf8 utils, both Coq and Camomile have similar implementations, at some point
   we should remove this but for now we keep it internal. For now we use the
   Camomile functions *)

type utf8_string = string

(* NOTE: from here on the type name [char] means [int] (a character count /
   index); it shadows the builtin type name for the rest of this file. *)
type char = int
type utf8_index = int
type utf16_index = int

(* Taken from Camomile *)
(* Copyright (C) 2002, 2003 Yamagata Yoriyuki. *)

(* [search_head s i] returns the first index [j >= i] whose byte is either
   ASCII (< 0x80) or >= 0xc2, or [String.length s] if there is none. *)
let rec search_head s i =
  if i >= String.length s then i
  else
    let n = Char.code (String.unsafe_get s i) in
    if n < 0x80 || n >= 0xc2 then i else search_head s (i + 1)

(* [next s i] returns the byte index following the character that starts at
   byte [i]; the width is derived from the byte at [i] only, so the result can
   exceed [String.length s] on a truncated sequence. When [i] points at a byte
   in 0x80..0xbf, the result is [search_head s (i + 1)].

   Raises [Invalid_argument] if [i] is not a valid index of [s], or if the byte
   at [i] is 0xfe or 0xff. *)
let next s i =
  let n = Char.code s.[i] in
  if n < 0x80 then i + 1
  else if n < 0xc0 then search_head s (i + 1)
  else if n <= 0xdf then i + 2
  else if n <= 0xef then i + 3
  else if n <= 0xf7 then i + 4
  else if n <= 0xfb then i + 5
  else if n <= 0xfd then i + 6
  else invalid_arg "UTF8.next"

(* [length_aux s c i] counts characters of [s] starting at byte [i], with [c]
   characters already counted. Raises [Invalid_argument] when a character
   starts with a byte in 0x80..0xbf or with 0xfe / 0xff. *)
let rec length_aux s c i =
  if i >= String.length s then c
  else
    let n = Char.code (String.unsafe_get s i) in
    let k =
      if n < 0x80 then 1
      else if n < 0xc0 then invalid_arg "UTF8.length"
      else if n < 0xe0 then 2
      else if n < 0xf0 then 3
      else if n < 0xf8 then 4
      else if n < 0xfc then 5
      else if n < 0xfe then 6
      else invalid_arg "UTF8.length"
    in
    length_aux s (c + 1) (i + k)

(* Number of characters in the UTF-8 string [s]; see [length_aux] for the
   failure cases. *)
let length s = length_aux s 0 0

(* [nth_aux s i n] advances [n] characters from byte index [i] using [next]. *)
let rec nth_aux s i n = if n = 0 then i else nth_aux s (next s i) (n - 1)

(* Byte index of the [n]-th character of [s] (0-based). *)
let nth s n = nth_aux s 0 n

(* end of Camomile *)

(* We disabled auto-formatting in copied code *)
[@@@ocamlformat "disable=true"]

(* The following is copied from OCaml's standard library Bytes and Uchar
   modules. We use the public safe variant of various functions, so it should be
   slower.

   TODO: when our minimum supported OCaml version is >= 4.14 we should switch to
   the standard library. *)

(* From Uchar.ml *)

(* A decode result is packed in an int: the low 24 bits hold the code point,
   the bits above hold the number of bytes consumed, with bit 27 set when the
   decode is valid (see [utf_decode] / [utf_decode_invalid]). *)
let rep = 0xFFFD
let decode_bits = 24
let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (Uchar.to_int u)
let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep
let[@inline] uchar_utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF)

(* Number of bytes (2 or 4) taken by [u] when encoded in UTF-16. *)
let uchar_utf_16_byte_length u = match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0xFFFF -> 2
| u when u <= 0x10FFFF -> 4
| _ -> assert false

(* From bytes.ml *)
let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8

let[@inline] utf_8_uchar_2 b0 b1 =
  ((b0 land 0x1F) lsl 6) lor
  ((b1 land 0x3F))

let[@inline] utf_8_uchar_3 b0 b1 b2 =
  ((b0 land 0x0F) lsl 12) lor
  ((b1 land 0x3F) lsl 6) lor
  ((b2 land 0x3F))

let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
  ((b0 land 0x07) lsl 18) lor
  ((b1 land 0x3F) lsl 12) lor
  ((b2 land 0x3F) lsl 6) lor
  ((b3 land 0x3F))

let[@inline] dec_ret n u = utf_decode n (Uchar.unsafe_of_int u)
let dec_invalid = utf_decode_invalid

(* [string_get_utf_8_uchar s i] decodes the UTF-8 sequence starting at byte [i]
   of [s] and returns a packed decode (see above). Malformed or truncated
   sequences yield an invalid decode carrying [rep]; only an out-of-bounds [i]
   raises ([Invalid_argument], from [Bytes.get_uint8]). *)
let string_get_utf_8_uchar s i =
  let b = Bytes.unsafe_of_string s in
  let b0 = Bytes.get_uint8 b i in (* raises if [i] is not a valid index. *)
  let get = Bytes.get_uint8 in
  let max = Bytes.length b - 1 in
  match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
  | '\x00' .. '\x7F' -> dec_ret 1 b0
  | '\xC2' .. '\xDF' ->
      let i = i + 1 in if i > max then dec_invalid 1 else
      let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
      dec_ret 2 (utf_8_uchar_2 b0 b1)
  | '\xE0' ->
      let i = i + 1 in if i > max then dec_invalid 1 else
      let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
      let i = i + 1 in if i > max then dec_invalid 2 else
      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
      dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
  | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
      let i = i + 1 in if i > max then dec_invalid 1 else
      let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
      let i = i + 1 in if i > max then dec_invalid 2 else
      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
      dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
  | '\xED' ->
      let i = i + 1 in if i > max then dec_invalid 1 else
      let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
      let i = i + 1 in if i > max then dec_invalid 2 else
      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
      dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
  | '\xF0' ->
      let i = i + 1 in if i > max then dec_invalid 1 else
      let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
      let i = i + 1 in if i > max then dec_invalid 2 else
      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
      let i = i + 1 in if i > max then dec_invalid 3 else
      let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
      dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
  | '\xF1' .. '\xF3' ->
      let i = i + 1 in if i > max then dec_invalid 1 else
      let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
      let i = i + 1 in if i > max then dec_invalid 2 else
      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
      let i = i + 1 in if i > max then dec_invalid 3 else
      let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
      dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
  | '\xF4' ->
      let i = i + 1 in if i > max then dec_invalid 1 else
      let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
      let i = i + 1 in if i > max then dec_invalid 2 else
      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
      let i = i + 1 in if i > max then dec_invalid 3 else
      let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
      dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
  | _ -> dec_invalid 1

(* End of copy from Stdlib *)
[@@@ocamlformat "disable=false"]

(* [length_utf16 line] is the number of UTF-16 code units needed to encode the
   UTF-8 string [line]: each decoded character contributes 1 or 2 units. The
   cursor is advanced with [next], so [Invalid_argument] from [next] (bytes
   0xfe / 0xff) propagates to the caller. *)
let length_utf16 line =
  let byte_idx = ref 0 in
  let utf16_len = ref 0 in
  let len = String.length line in
  while !byte_idx < len do
    let ch = string_get_utf_8_uchar line !byte_idx in
    let next_idx = next line !byte_idx in
    byte_idx := next_idx;
    let l = uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2 in
    utf16_len := !utf16_len + l
  done;
  !utf16_len

(* UTF16 <-> UTF8 *)

(* [utf8_offset_of_utf16_offset ~line ~offset] converts a UTF-16 code-unit
   offset into a byte offset of [line].

   This is best effort: the cursor never advances to or past
   [String.length line], so an [offset] at or beyond the end of the line yields
   the byte index of the last character reached (0 for an empty line). A
   non-positive [offset] yields 0. *)
let utf8_offset_of_utf16_offset ~line ~(offset : utf16_index) =
  let byte_idx = ref 0 in
  let utf16_char_count = ref 0 in
  let len = String.length line in
  (try
     while !utf16_char_count < offset do
       let ch = string_get_utf_8_uchar line !byte_idx in
       let next_idx = next line !byte_idx in
       (* [Not_found] is only used to break out of the loop *)
       if next_idx >= len then raise Not_found else byte_idx := next_idx;
       let code_unit_count =
         uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
       in
       utf16_char_count := !utf16_char_count + code_unit_count
     done
   with
   (* [Invalid_argument] comes from the decoder / [next] on an out-of-bounds
      index or on bytes 0xfe / 0xff; anything else is left to propagate. *)
   | Not_found | Invalid_argument _ -> ());
  !byte_idx

(* [utf16_offset_of_utf8_offset ~line ~offset] converts a byte offset of [line]
   into a UTF-16 code-unit offset, by summing the UTF-16 width of every
   character that starts before byte [offset].

   This is best effort: the walk stops at the end of [line] (or at a trailing
   sequence whose announced width runs past it) and the count accumulated so
   far is returned. *)
let utf16_offset_of_utf8_offset ~line ~(offset : utf8_index) =
  let byte_idx = ref 0 in
  let utf16_char_count = ref 0 in
  let len = String.length line in
  (try
     while !byte_idx < offset do
       let ch = string_get_utf_8_uchar line !byte_idx in
       let next_idx = next line !byte_idx in
       (* [Not_found] is only used to break out of the loop *)
       if next_idx > len then raise Not_found else byte_idx := next_idx;
       let code_unit_count =
         uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
       in
       utf16_char_count := !utf16_char_count + code_unit_count
     done
   with
   (* see [utf8_offset_of_utf16_offset] for the exceptions expected here *)
   | Not_found | Invalid_argument _ -> ());
  !utf16_char_count

(******************************************************)
(** Not used anywhere, remove? *)
(******************************************************)

(* UTF16 <-> Char *)

(* [char_of_utf16_offset ~line ~offset] converts a UTF-16 code-unit offset into
   a character count. It performs the same walk, with the same end-of-line
   clamping, as [utf8_offset_of_utf16_offset], but returns the number of
   characters stepped over instead of the byte index. *)
let char_of_utf16_offset ~line ~(offset : utf16_index) =
  let byte_idx = ref 0 in
  let count = ref 0 in
  let utf16_char_count = ref 0 in
  let len = String.length line in
  (try
     while !utf16_char_count < offset do
       let ch = string_get_utf_8_uchar line !byte_idx in
       let next_idx = next line !byte_idx in
       (* [Not_found] is only used to break out of the loop *)
       if next_idx >= len then raise Not_found else byte_idx := next_idx;
       let code_unit_count =
         uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
       in
       utf16_char_count := !utf16_char_count + code_unit_count;
       count := !count + 1
     done
   with
   (* see [utf8_offset_of_utf16_offset] for the exceptions expected here *)
   | Not_found | Invalid_argument _ -> ());
  !count

(* [utf16_offset_of_char ~line ~char] is the UTF-16 code-unit offset of the
   [char]-th character of [line].

   Unlike the functions above, this one does not clamp: if [line] has fewer
   than [char] characters, the decoder is called past the end of the string
   and [Invalid_argument] is raised. *)
let utf16_offset_of_char ~line ~(char : char) =
  let offset16 = ref 0 in
  let idx = ref 0 in
  for _ = 0 to char - 1 do
    let ch = string_get_utf_8_uchar line !idx in
    let byte_len = uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) in
    offset16 := !offset16 + (byte_len / 2);
    idx := next line !idx
  done;
  !offset16

(* UTF8 <-> Char *)

(* [utf8_offset_of_char ~line ~char] is [Some b], with [b] the byte index of
   the [char]-th character of [line], or [None] when [char] is out of bounds,
   that is to say [char >= length line]; in particular the result is always
   [None] for an empty line. [length] can raise [Invalid_argument] on malformed
   input. *)
let utf8_offset_of_char ~line ~char =
  if char < length line then Some (nth line char) else None

(* [find_char line byte] is [Some n], with [n] the index of the character that
   contains byte [byte], or [None] when [byte >= String.length line]. *)
let find_char line byte =
  let rec f index n_chars =
    let next_index = next line index in
    if next_index > byte then n_chars else f next_index (n_chars + 1)
  in
  if byte < String.length line then Some (f 0 0) else None

(* [char_of_utf8_offset ~line ~offset] converts a byte offset into a character
   index; see [find_char]. *)
let char_of_utf8_offset ~line ~offset =
  (* if Debug.unicode then *)
  (* Io.Log.trace "char_of_index" *)
  (* (Format.asprintf "str: '%s' | byte: %d" line byte); *)
  let char = find_char line offset in
  (* (if Debug.unicode then *)
  (* match char with *)
  (* | None -> Io.Log.trace "get_last_text" "failed" *)
  (* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d"
     char)); *)
  char