Spaces:

Kaizouku
/

Multi-model-Chatbot

Sleeping

App Files Files Community

Multi-model-Chatbot / public /gpt-2 /transformers /models /openai /tokenization_openai_fast.py

Kaizouku

Upload 564 files

2260825 verified about 1 year ago

raw

history blame contribute delete

3.09 kB

	# coding=utf-8
	# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Fast Tokenization classes for OpenAI GPT."""


	from typing import Optional, Tuple

	from ...tokenization_utils_fast import PreTrainedTokenizerFast
	from ...utils import logging
	from .tokenization_openai import OpenAIGPTTokenizer


	# Module-level logger, created through the package's own `logging` helper
	# (imported from `...utils`, not the standard-library module).
	logger = logging.get_logger(__name__)

	# Maps each tokenizer resource key to the file name used for it.
	VOCAB_FILES_NAMES = {
	    "vocab_file": "vocab.json",
	    "merges_file": "merges.txt",
	    "tokenizer_file": "tokenizer.json",
	}

	# For every resource key, the URL of that resource per checkpoint name.
	# Only the `openai-gpt` checkpoint is listed.
	PRETRAINED_VOCAB_FILES_MAP = {
	    "vocab_file": {
	        "openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json",
	    },
	    "merges_file": {
	        "openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt",
	    },
	    "tokenizer_file": {
	        "openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/tokenizer.json",
	    },
	}

	# Keyed by checkpoint name; exposed on the tokenizer class as
	# `max_model_input_sizes`.
	PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"openai-gpt": 512}


	class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
	    """
	    Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
	    the following peculiarities:

	    - lower case all inputs
	    - uses BERT's BasicTokenizer for pre-BPE tokenization

	    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
	    methods. Users should refer to this superclass for more information regarding those methods.

	    Args:
	        vocab_file (:obj:`str`, `optional`):
	            Path to the vocabulary file.
	        merges_file (:obj:`str`, `optional`):
	            Path to the merges file.
	        tokenizer_file (:obj:`str`, `optional`):
	            Path to a serialized `tokenizers` file (named ``tokenizer.json`` in :obj:`VOCAB_FILES_NAMES`).
	        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
	            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
	            token instead.
	        **kwargs:
	            Additional keyword arguments, forwarded unchanged to :class:`~transformers.PreTrainedTokenizerFast`.
	    """

	    # Class-level configuration, bound to the module-level tables defined in this file.
	    vocab_files_names = VOCAB_FILES_NAMES
	    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
	    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
	    model_input_names = ["input_ids", "attention_mask"]
	    # The slow (pure-Python) tokenizer class that corresponds to this fast one.
	    slow_tokenizer_class = OpenAIGPTTokenizer

	    def __init__(self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="<unk>", **kwargs):
	        """
	        Forward all arguments to the :class:`~transformers.PreTrainedTokenizerFast` constructor.

	        ``vocab_file`` and ``merges_file`` are passed positionally; ``tokenizer_file``, ``unk_token`` and any extra
	        ``kwargs`` are passed by keyword.
	        """
	        super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)

	    @property
	    def do_lower_case(self):
	        """:obj:`bool`: Always :obj:`True` for this tokenizer."""
	        return True

	    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
	        """
	        Save the backend model's files by delegating to ``self._tokenizer.model.save``.

	        Args:
	            save_directory (:obj:`str`):
	                Directory handed to the backend model's ``save``.
	            filename_prefix (:obj:`str`, `optional`):
	                Forwarded as the ``name`` argument of the backend model's ``save``.

	        Returns:
	            :obj:`Tuple[str, ...]`: The value returned by the backend model's ``save``, converted to a tuple
	            (expected to be the paths of the files that were written).
	        """
	        # The backend may report more than one file, hence the variable-length tuple annotation.
	        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
	        return tuple(files)