huggingface_tokenizer

HuggingFaceTokenizer

Bases: BaseTokenizer

Source Code in griptape/tokenizers/huggingface_tokenizer.py
@define()
class HuggingFaceTokenizer(BaseTokenizer):
    tokenizer: PreTrainedTokenizerBase = field(
        default=Factory(
            lambda self: import_optional_dependency("transformers").AutoTokenizer.from_pretrained(self.model),
            takes_self=True,
        ),
        kw_only=True,
    )
    _max_input_tokens: int = field(
        default=Factory(lambda self: self.tokenizer.model_max_length, takes_self=True),
        kw_only=True,
        alias="max_input_tokens",
    )
    _max_output_tokens: int = field(default=4096, kw_only=True, alias="max_output_tokens")

    def count_tokens(self, text: str) -> int:
        return len(self.tokenizer.encode(text))  # pyright: ignore[reportArgumentType]
  • _max_input_tokens = field(default=Factory(lambda self: self.tokenizer.model_max_length, takes_self=True), kw_only=True, alias='max_input_tokens')

  • _max_output_tokens = field(default=4096, kw_only=True, alias='max_output_tokens')

  • tokenizer = field(default=Factory(lambda self: import_optional_dependency('transformers').AutoTokenizer.from_pretrained(self.model), takes_self=True), kw_only=True)
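The tokenizer field is built lazily from the model id via the attrs Factory unless a PreTrainedTokenizerBase instance is passed in directly. A minimal usage sketch, assuming the optional transformers dependency is installed and using "gpt2" purely as an example model id:

from transformers import AutoTokenizer

from griptape.tokenizers import HuggingFaceTokenizer

# Default construction: the Factory calls AutoTokenizer.from_pretrained(self.model).
hf_tokenizer = HuggingFaceTokenizer(model="gpt2", max_output_tokens=256)

# Alternatively, supply a pre-built tokenizer instance yourself.
hf_tokenizer = HuggingFaceTokenizer(
    model="gpt2",
    tokenizer=AutoTokenizer.from_pretrained("gpt2"),
)

Note that max_input_tokens and max_output_tokens are the public aliases for the private fields shown above; when max_input_tokens is not given, it defaults to the underlying tokenizer's model_max_length.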

count_tokens(text)

Source Code in griptape/tokenizers/huggingface_tokenizer.py
def count_tokens(self, text: str) -> int:
    return len(self.tokenizer.encode(text))  # pyright: ignore[reportArgumentType]
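A short sketch of token counting, again assuming "gpt2" as an example model id; the return value is simply the length of the token id list produced by the underlying tokenizer's encode method:

from griptape.tokenizers import HuggingFaceTokenizer

hf_tokenizer = HuggingFaceTokenizer(model="gpt2")

# Encodes the text with the wrapped transformers tokenizer and counts the ids.
num_tokens = hf_tokenizer.count_tokens("Hello, world!")
print(num_tokens)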
