Saga_tokenizers.Trainers (Source)

Training module for tokenization models.
Main trainer type
val bpe :
?vocab_size:int ->
?min_frequency:int ->
?special_tokens:string list ->
?limit_alphabet:int ->
?initial_alphabet:string list ->
?continuing_subword_prefix:string ->
?end_of_word_suffix:string ->
?show_progress:bool ->
?max_token_length:int ->
unit ->
t
Create a BPE trainer.
val wordpiece :
?vocab_size:int ->
?min_frequency:int ->
?special_tokens:string list ->
?limit_alphabet:int ->
?initial_alphabet:string list ->
?continuing_subword_prefix:string ->
?end_of_word_suffix:string ->
?unk_token:string ->
?show_progress:bool ->
unit ->
t
Create a WordPiece trainer.
val word_level :
?vocab_size:int ->
?min_frequency:int ->
?special_tokens:string list ->
?show_progress:bool ->
unit ->
t
Create a WordLevel trainer.
val unigram :
?vocab_size:int ->
?n_sub_iterations:int ->
?shrinking_factor:float ->
?unk_token:string ->
?special_tokens:string list ->
?show_progress:bool ->
?initial_alphabet:string list ->
?max_piece_length:int ->
unit ->
t
Create a Unigram trainer.
val chars :
?min_frequency:int ->
?special_tokens:string list ->
?show_progress:bool ->
unit ->
t
Create a character-level trainer.
Train a model on the given files.
val train_from_iterator :
t ->
iterator:(unit -> string option) ->
?model:Models.t ->
unit ->
training_result
Train a model from an iterator.