Saga_tokenizers.Models [Source]
Tokenization models module.
Tokenization result
type bpe_model = {
  vocab : (string, int) Hashtbl.t;
  merges : (string * string) list;
  cache_capacity : int;
  dropout : float option;
  unk_token : string option;
  continuing_subword_prefix : string option;
  end_of_word_suffix : string option;
  fuse_unk : bool;
  byte_fallback : bool;
}
Model configurations
type t =
  | BPE of bpe_model
  | WordPiece of wordpiece_model
  | WordLevel of wordlevel_model
  | Unigram of unigram_model
Main model type
val bpe :
?vocab:(string * int) list ->
?merges:(string * string) list ->
?cache_capacity:int ->
?dropout:float ->
?unk_token:string ->
?continuing_subword_prefix:string ->
?end_of_word_suffix:string ->
?fuse_unk:bool ->
?byte_fallback:bool ->
?ignore_merges:bool ->
unit ->
t
Create a BPE model
val wordpiece :
?vocab:(string * int) list ->
?unk_token:string ->
?continuing_subword_prefix:string ->
?max_input_chars_per_word:int ->
unit ->
t
Create a WordPiece model
Create a WordLevel model
val unigram :
?vocab:(string * float) list ->
?unk_token:string ->
?byte_fallback:bool ->
?max_piece_length:int ->
?n_sub_iterations:int ->
?shrinking_factor:float ->
unit ->
t
Create a Unigram model
Add tokens to the model's vocabulary. Returns the number of tokens added.