(*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 *)

(* table from 0-based line number and 0-based column number to the offset at that point *)
type t = int array array

(* Selects the unit in which offsets are measured: `Utf8` uses `utf8_size_of_kind`
   and `JavaScript` uses `js_size_of_kind` (see `make`). *)
type offset_kind =
  | Utf8
  | JavaScript

(* Classify each codepoint. We care about how many bytes each codepoint takes, in order to
   compute offsets in terms of bytes instead of codepoints. We also care about various kinds of
   newlines. To reduce memory, it is important that this is a basic variant with no parameters
   (so, don't make it `Chars of int`). *)
type kind =
  (* Char has a codepoint greater than or equal to 0x0 but less than 0x80 *)
  | Chars_0x0
  (* Char has a codepoint greater than or equal to 0x80 but less than 0x800 *)
  | Chars_0x80
  (* Char has a codepoint greater than or equal to 0x800 but less than 0x10000 *)
  | Chars_0x800
  (* Char has a codepoint greater than or equal to 0x10000 *)
  | Chars_0x10000
  (* Wtf8 reported a malformed sequence *)
  | Malformed
  (* Codepoint 0xD *)
  | Cr
  (* Codepoint 0xA *)
  | Nl
  (* Codepoint 0x2028 or 0x2029 *)
  | Ls

(* Gives the size in bytes of the character's UTF-8 encoding *)
let utf8_size_of_kind = function
  | Chars_0x0 -> 1
  | Chars_0x80 -> 2
  | Chars_0x800 -> 3
  | Chars_0x10000 -> 4
  | Malformed -> 1
  | Cr -> 1
  | Nl -> 1
  | Ls -> 3

(* Gives the size in code units (16-bit blocks) of the character's UTF-16 encoding *)
let js_size_of_kind = function
  | Chars_0x0
  | Chars_0x80
  | Chars_0x800 ->
    1
  | Chars_0x10000 -> 2
  | Malformed -> 1
  | Cr -> 1
  | Nl -> 1
  | Ls -> 1

(* [make ~kind text] builds the offset table for [text]. Each row of the result corresponds to
   one line and holds the offset of every character on that line, followed by the offset of the
   line terminator that ends it. Offsets are measured in the unit selected by [kind]. *)
let make =
  (* Using Wtf8 allows us to properly track multi-byte characters, so that we increment the column
   * by 1 for a multi-byte character, but increment the offset by the number of bytes in the
   * character. It also keeps us from incrementing the line number if a multi-byte character happens
   * to include e.g. the codepoint for '\n' as a second-fourth byte. *)
  let fold_codepoints acc _offset chr =
    let kind =
      match chr with
      | Wtf8.Point code ->
        (* Structural equality: these are plain integer comparisons. The line terminators are
           tested before the size ranges because 0x2028/0x2029, 0xA and 0xD each fall inside
           one of those ranges. *)
        if code = 0x2028 || code = 0x2029 then
          Ls
        else if code = 0xA then
          Nl
        else if code = 0xD then
          Cr
        else if code >= 0x10000 then
          Chars_0x10000
        else if code >= 0x800 then
          Chars_0x800
        else if code >= 0x80 then
          Chars_0x80
        else
          Chars_0x0
      | Wtf8.Malformed -> Malformed
    in
    (* Kinds are accumulated in reverse order; the caller reverses the list. *)
    kind :: acc
  in
  (* Traverses a `kind list`, breaking it up into an `int array array`, where each `int array`
     contains the offsets at each character (aka codepoint) of a line.

     The accumulator is `(offset, rev_line, acc)`: the offset of the next character, the offsets
     already recorded for the current line (reversed), and the completed lines (reversed). *)
  let rec build_table size_of_kind (offset, rev_line, acc) = function
    | [] -> Array.of_list (List.rev acc)
    | Cr :: Nl :: rest ->
      (* https://www.ecma-international.org/ecma-262/5.1/#sec-7.3 says that "\r\n" should be treated
         like a single line terminator, even though both '\r' and '\n' are line terminators in their
         own right. *)
      let line = Array.of_list (List.rev (offset :: rev_line)) in
      build_table size_of_kind (offset + 2, [], line :: acc) rest
    | ((Cr | Nl | Ls) as kind) :: rest ->
      (* The terminator's own offset is stored as the last entry of the line it ends. *)
      let line = Array.of_list (List.rev (offset :: rev_line)) in
      build_table size_of_kind (offset + size_of_kind kind, [], line :: acc) rest
    | ((Chars_0x0 | Chars_0x80 | Chars_0x800 | Chars_0x10000 | Malformed) as kind) :: rest ->
      build_table size_of_kind (offset + size_of_kind kind, offset :: rev_line, acc) rest
  in
  fun ~kind text ->
    let rev_kinds = Wtf8.fold_wtf_8 fold_codepoints [] text in
    (* Add a phantom line at the end of the file. Since end positions are reported exclusively, it
     * is possible for the lexer to output an end position with a line number one higher than the
     * last line, to indicate something such as "the entire last line." For this purpose, we can
     * return the offset that is one higher than the last legitimate offset, since it could only be
     * correctly used as an exclusive index. *)
    let rev_kinds = Nl :: rev_kinds in
    let size_of_kind =
      match kind with
      | Utf8 -> utf8_size_of_kind
      | JavaScript -> js_size_of_kind
    in
    build_table size_of_kind (0, [], []) (List.rev rev_kinds)

(* Raised by `lookup` (and therefore `offset`) when an index falls outside the table. Carries
   the position being looked up and a descriptive message. *)
exception Offset_lookup_failed of Loc.position * string

(* [lookup arr i pos context_string] returns [arr.(i)]. If the index is out of bounds, raises
   [Offset_lookup_failed] with [pos] and a message naming [context_string], the index and the
   array length. *)
let lookup arr i pos context_string =
  try arr.(i) with
  | Invalid_argument _ ->
    let msg =
      Printf.sprintf
        "Failure while looking up %s. Index: %d. Length: %d."
        context_string
        i
        (Array.length arr)
    in
    raise (Offset_lookup_failed (pos, msg))

(* [offset table pos] returns the offset recorded in [table] for position [pos].
   Raises [Offset_lookup_failed] if the line or the column is not present in the table. *)
let offset table pos =
  Loc.(
    (* Special-case `Loc.none` so we don't try to look up line -1. *)
    if pos.line = 0 && pos.column = 0 then
      (* Loc.none sets the offset as 0, so that's what we'll return here. *)
      0
    else
      (* lines are 1-indexed, columns are zero-indexed *)
      let line_table = lookup table (pos.line - 1) pos "line" in
      lookup line_table pos.column pos "column"
  )

(* Renders the table as text: one output line per table row, consisting of the 0-based row
   index followed by every offset stored in that row. *)
let debug_string table =
  let buf = Buffer.create 4096 in
  Array.iteri
    (fun line_num line ->
      Printf.bprintf buf "%6d: " line_num;
      Array.iter (fun offset -> Printf.bprintf buf "%8d " offset) line;
      Buffer.add_char buf '\n')
    table;
  Buffer.contents buf

(* Returns one value per table row, in row order: the last offset stored in that row minus the
   last offset stored in the previous row, with -1 standing in for the row before the first.
   Every row is expected to be non-empty (rows produced by `make` always are); an empty row
   makes the array access raise `Invalid_argument`. *)
let line_lengths table =
  Array.fold_left
    (fun (prev_line_end, lengths_rev) line ->
      let line_end = line.(Array.length line - 1) in
      (line_end, (line_end - prev_line_end) :: lengths_rev))
    (-1, [])
    table
  |> snd
  |> List.rev

(* Returns true if, within any single row, two consecutive offsets differ by more than 1.
   Only offsets inside the same row are compared; the gap between the last offset of one row
   and the first offset of the next is not examined. *)
let contains_multibyte_character table =
  let exception FoundMultibyte in
  try
    Array.iter
      (fun line ->
        Array.iteri
          (fun i offset ->
            if i > 0 then
              let offset_before = line.(i - 1) in
              if offset - offset_before > 1 then raise FoundMultibyte)
          line)
      table;
    false
  with
  | FoundMultibyte -> true