Add truncate option to tokenize (#126)
* Add truncate_text option to tokenize

  This makes it possible to run tokenize on texts whose encodings are longer than the context length, without having to guess beforehand where to cut the text in characters.

* add doc, rename to just "truncate", use eot_token

Co-authored-by: Jong Wook Kim <jongwook@openai.com>
parent db20393f4a
commit a2737ac264
clip/clip.py

@@ -180,7 +180,7 @@ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=False):
     return model, _transform(model.input_resolution.item())


-def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
+def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
     """
     Returns the tokenized representation of given input string(s)

@@ -192,6 +192,9 @@ def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
     context_length : int
         The context length to use; all CLIP models use 77 as the context length

+    truncate: bool
+        Whether to truncate the text in case its encoding is longer than the context length
+
     Returns
     -------
     A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]

@@ -206,6 +209,10 @@ def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:

     for i, tokens in enumerate(all_tokens):
         if len(tokens) > context_length:
-            raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
+            if truncate:
+                tokens = tokens[:context_length]
+                tokens[-1] = eot_token
+            else:
+                raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
         result[i, :len(tokens)] = torch.tensor(tokens)
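For quick reference, a minimal usage sketch of the new flag (the long caption below is illustrative, not part of the commit):

    import clip

    # An input whose BPE encoding exceeds the default 77-token context length.
    long_caption = "a photo of " + "a very, very " * 50 + "long caption"

    # Previously this raised RuntimeError; with truncate=True the encoding is
    # cut to context_length and its final token is replaced with <|endoftext|>.
    tokens = clip.tokenize(long_caption, truncate=True)
    print(tokens.shape)  # torch.Size([1, 77])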