Saga_tokenizers.Tokenizer

type t

Main tokenizer type.
type padding_config = {
  direction : direction;
  pad_id : int;
  pad_type_id : int;
  pad_token : string;
  length : int option;
  pad_to_multiple_of : int option;
}

Record for padding config.
Record for truncation config.
Load from a pretrained source; returns a result and uses defaults for omitted options.
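A minimal loading sketch; the loader's exact name and argument are not shown in this section, so [from_pretrained] and the path are assumptions:

(* Assumed loader name and argument; only "load from pretrained, returning a
   result" is documented above. *)
let load () =
  match Saga_tokenizers.Tokenizer.from_pretrained "path/to/tokenizer.json" with
  | Ok tokenizer -> tokenizer
  | Error _ -> failwith "could not load tokenizer"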
Configuration
Set normalizer.
Get normalizer.
Set pre-tokenizer.
Get pre-tokenizer.
Set post-processor.
Get post-processor.
Set decoder.
Get decoder.
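A configuration sketch; the setter identifiers ([set_normalizer], [set_pre_tokenizer]) and their unit return type are assumed for illustration, since only the operations themselves are listed above. The components are taken as parameters rather than constructed here:

(* Assumed setter names and return types; the normalizer and pre-tokenizer
   values are passed in by the caller. *)
let configure tokenizer ~normalizer ~pre_tokenizer =
  Saga_tokenizers.Tokenizer.set_normalizer tokenizer normalizer;
  Saga_tokenizers.Tokenizer.set_pre_tokenizer tokenizer pre_tokenizer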
Padding and Truncation
Enable padding using a padding_config record.
Get the current padding config.
Enable truncation using a truncation config record.
Get the current truncation config.
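A sketch of enabling padding from the record type above; [enable_padding] is an assumed name for the "enable padding" operation, and the direction value is taken as a parameter because the direction constructors are not shown in this section:

(* [enable_padding] is an assumed identifier; the padding_config record
   matches the type shown earlier. *)
let pad_batches tokenizer ~direction =
  let config : Saga_tokenizers.Tokenizer.padding_config =
    { direction;
      pad_id = 0;
      pad_type_id = 0;
      pad_token = "<pad>";
      length = None;               (* no fixed target length (assumed to mean: pad within the batch) *)
      pad_to_multiple_of = Some 8; (* round padded lengths up to a multiple of 8 *)
    }
  in
  Saga_tokenizers.Tokenizer.enable_padding tokenizer config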
Vocabulary Management
Add tokens; returns the number of tokens actually added.
Add special tokens.
Get the vocabulary as a list (optional arguments take defaults).
Get the added tokens.
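A vocabulary sketch; [add_tokens] and [add_special_tokens] are assumed identifiers for the two operations listed above, and plain string lists are assumed as input:

(* Assumed identifiers; the docs above only state that tokens can be added
   and that the number actually added is returned. *)
let extend_vocab tokenizer =
  let n = Saga_tokenizers.Tokenizer.add_tokens tokenizer [ "<url>"; "<email>" ] in
  let _ = Saga_tokenizers.Tokenizer.add_special_tokens tokenizer [ "<pad>"; "<eos>" ] in
  Printf.printf "added %d tokens\n" n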
Training
Train from files.
val train_from_iterator :
  t ->
  string Seq.t ->
  ?trainer:Trainers.t ->
  ?length:int ->
  unit ->
  unit

Train from text sequence.
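A training sketch that follows the signature above; the in-memory corpus is illustrative, ?trainer is left at its default, and ?length is presumably a size hint for the iterator (an assumption about its purpose):

(* Follows the signature above: the corpus is a [string Seq.t]. *)
let train_small tokenizer =
  let corpus = [ "hello world"; "tokenizers in OCaml" ] in
  Saga_tokenizers.Tokenizer.train_from_iterator tokenizer
    (List.to_seq corpus) ~length:(List.length corpus) ()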
Encoding and Decoding

val encode :
  t ->
  sequence:(string, string list) Either.t ->
  ?pair:(string, string list) Either.t ->
  ?is_pretokenized:bool ->
  ?add_special_tokens:bool ->
  unit ->
  Encoding.t

Encode single or pair, allowing pretokenized lists.
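A sketch against the signature above: a raw string is passed as [Either.Left], a pretokenized list of strings as [Either.Right]:

(* Grounded in the signature above: Left carries a raw string, Right a
   pretokenized list of strings. *)
let encode_text tokenizer text =
  Saga_tokenizers.Tokenizer.encode tokenizer
    ~sequence:(Either.Left text) ~add_special_tokens:true ()

let encode_pretokenized tokenizer words =
  Saga_tokenizers.Tokenizer.encode tokenizer
    ~sequence:(Either.Right words) ~is_pretokenized:true ()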
val encode_batch :
  t ->
  input:
    ((string, string list) Either.t,
     (string, string list) Either.t * (string, string list) Either.t)
    Either.t
    list ->
  ?is_pretokenized:bool ->
  ?add_special_tokens:bool ->
  unit ->
  Encoding.t list

Batch encode with flexible inputs.
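Each batch element is itself an [Either.t]: [Left] for a single sequence, [Right] for a pair. A sketch against the signature above, building question/context pairs:

(* Grounded in the signature above: singles are Left, pairs are Right, and
   each side of a pair is again a raw-string-or-pretokenized Either.
   [List.map2] raises if the two lists differ in length. *)
let encode_qa_batch tokenizer questions contexts =
  let input =
    List.map2
      (fun q c -> Either.Right (Either.Left q, Either.Left c))
      questions contexts
  in
  Saga_tokenizers.Tokenizer.encode_batch tokenizer ~input ~add_special_tokens:true ()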
val decode :
  t ->
  int list ->
  ?skip_special_tokens:bool ->
  ?clean_up_tokenization_spaces:bool ->
  unit ->
  string

Decode with defaults.
val decode_batch :
  t ->
  int list list ->
  ?skip_special_tokens:bool ->
  ?clean_up_tokenization_spaces:bool ->
  unit ->
  string list

Batch decode with defaults.
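A decoding sketch that follows the two signatures above; the id lists would normally come from previously computed encodings:

(* Grounded in the signatures above; special tokens are stripped and the
   other optional argument keeps its default. *)
let show_decoded tokenizer ids =
  print_endline
    (Saga_tokenizers.Tokenizer.decode tokenizer ids ~skip_special_tokens:true ())

let show_decoded_batch tokenizer id_batches =
  Saga_tokenizers.Tokenizer.decode_batch tokenizer id_batches
    ~skip_special_tokens:true ()
  |> List.iter print_endline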
val post_process :
  t ->
  encoding:Encoding.t ->
  ?pair:Encoding.t ->
  ?add_special_tokens:bool ->
  unit ->
  Encoding.t

Post-process manually.
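A sketch against the signature above, applying the configured post-processor to two precomputed encodings:

(* Grounded in the signature above. *)
let join_pair tokenizer enc_a enc_b =
  Saga_tokenizers.Tokenizer.post_process tokenizer
    ~encoding:enc_a ~pair:enc_b ~add_special_tokens:true ()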
Serialization
Save to a file; pretty-printing is enabled by default.
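A saving sketch; [save] and its path argument are assumptions, and the pretty-printing option documented above is left at its default:

(* [save] is an assumed name; the doc above only states that saving to a file
   pretty-prints by default. *)
let persist tokenizer =
  Saga_tokenizers.Tokenizer.save tokenizer "tokenizer.json"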