Spaces:
Sleeping
Sleeping
# coding=utf-8 | |
# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Tokenization classes for OpenAI CLIP."""
import json | |
from typing import Optional, Tuple | |
from tokenizers import pre_tokenizers | |
from ...tokenization_utils_base import BatchEncoding | |
from ...tokenization_utils_fast import PreTrainedTokenizerFast | |
from ...utils import logging | |
from .tokenization_clip import CLIPTokenizer | |
# Module-level logger, named after this module and obtained through the library's own `logging` utilities.
logger = logging.get_logger(__name__)
# File name used for each tokenizer resource (vocabulary, BPE merges, serialized fast tokenizer).
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_file": "tokenizer.json",
}

# Download URL of every resource, keyed first by resource kind and then by checkpoint identifier.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/vocab.json",
    },
    "merges_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txt",
    },
    "tokenizer_file": {
        "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer.json",
    },
}

# Positional-embedding size per checkpoint; the tokenizer class exposes it as its maximum model input size.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"openai/clip-vit-base-patch32": 77}
class CLIPTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" CLIP tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not (the ids below are
    illustrative; the actual values depend on the vocabulary that is loaded):

    ::

        >>> from transformers import CLIPTokenizerFast
        >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer("Hello world")['input_ids']
        [15496, 995]
        >>> tokenizer(" Hello world")['input_ids']
        [18435, 995]

    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    .. note::

        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
        ``add_prefix_space=True``.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`, `optional`):
            Path to the vocabulary file.
        merges_file (:obj:`str`, `optional`):
            Path to the merges file.
        tokenizer_file (:obj:`str`, `optional`):
            Path to a serialized ``tokenizers`` file (``tokenizer.json``).
        unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`<|startoftext|>`):
            The beginning of sequence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
            The end of sequence token.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
            The token used for padding. Set only so that padding is enabled; note that :obj:`pad_token_id` is
            hard-coded to 0 independently of this value.
        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (CLIP tokenizer detect beginning of words by the preceding space).
        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information. Not a named
            parameter of this constructor; if given, it is forwarded unchanged to the parent constructor through
            ``**kwargs``.
        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the post-processing step should trim offsets to avoid including whitespaces. Not a named
            parameter of this constructor; if given, it is forwarded unchanged to the parent constructor through
            ``**kwargs``.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = CLIPTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # hack to enable padding
        add_prefix_space=False,
        **kwargs
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        # The backend pre-tokenizer carries its own ``add_prefix_space`` setting. When it disagrees with the value
        # requested here, rebuild the pre-tokenizer from its serialized state with the requested value.
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        self.add_prefix_space = add_prefix_space

    # Very ugly hack to enable padding
    @property
    def pad_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Always 0 for this tokenizer, whatever
        ``pad_token`` was set to.
        """
        # Must be a property: it is read as a plain attribute (``tokenizer.pad_token_id``), so a regular method
        # would hand back a bound method instead of an integer id.
        return 0

    def _assert_pretokenized_inputs_supported(self, is_split_into_words: bool) -> None:
        """Fail if pretokenized inputs are requested while ``add_prefix_space`` is disabled."""
        # Kept as an ``assert`` so that the exception type seen by callers (``AssertionError``) is unchanged.
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """
        Validate the ``is_split_into_words`` keyword argument, then delegate to the parent implementation.

        Raises:
            AssertionError: If ``is_split_into_words=True`` is passed and the tokenizer was not created with
            ``add_prefix_space=True``.
        """
        self._assert_pretokenized_inputs_supported(kwargs.get("is_split_into_words", False))
        return super()._batch_encode_plus(*args, **kwargs)

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """
        Validate the ``is_split_into_words`` keyword argument, then delegate to the parent implementation.

        Raises:
            AssertionError: If ``is_split_into_words=True`` is passed and the tokenizer was not created with
            ``add_prefix_space=True``.
        """
        self._assert_pretokenized_inputs_supported(kwargs.get("is_split_into_words", False))
        return super()._encode_plus(*args, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the backend tokenizer model into ``save_directory``.

        Args:
            save_directory (:obj:`str`):
                Directory passed to the backend model's ``save`` method.
            filename_prefix (:obj:`str`, `optional`):
                Passed to the backend model's ``save`` method as ``name``.

        Returns:
            :obj:`Tuple[str]`: The values returned by the backend model's ``save`` call, as a tuple.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)