# coding=utf-8
# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..bert.tokenization_bert import BasicTokenizer
from ..xlm.tokenization_xlm import XLMTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.json"
    },
    "merges_file": {
        "allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/merges.txt"
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}


class HerbertTokenizer(XLMTokenizer):
    """
    Construct a BPE tokenizer for HerBERT.

    Peculiarities:

    - uses BERT's pre-tokenizer: :class:`~transformers.BasicTokenizer` splits tokens on whitespace and also on
      punctuation, so each occurrence of a punctuation character is treated as a separate pre-token,
    - the pre-tokenized input is then subtokenized with BPE.

    This tokenizer inherits from :class:`~transformers.XLMTokenizer`, which contains most of the methods. Users
    should refer to the superclass for more information regarding those methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        merges_file,
        tokenizer_file=None,
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        sep_token="</s>",
        do_lowercase_and_remove_accent=False,
        **kwargs
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=None,  # the slow (Python) tokenizer never loads a serialized tokenizer file
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            sep_token=sep_token,
            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
            **kwargs,
        )

        self.bert_pre_tokenizer = BasicTokenizer(
            do_lower_case=False,
            never_split=self.all_special_tokens,
            tokenize_chinese_chars=False,
            strip_accents=False,
        )

    def _tokenize(self, text):
        # First pass: BERT-style pre-tokenization on whitespace and punctuation,
        # keeping the special tokens (e.g. <mask>) intact.
        pre_tokens = self.bert_pre_tokenizer.tokenize(text)

        # Second pass: split every non-empty pre-token into BPE subwords.
        split_tokens = []
        for token in pre_tokens:
            if token:
                split_tokens.extend(self.bpe(token).split(" "))
        return split_tokens
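

if __name__ == "__main__":
    # Minimal usage sketch, illustrative only and not part of the upstream module. It assumes
    # the "allegro/herbert-base-cased" checkpoint listed above is reachable on the Hugging Face
    # Hub and that `sacremoses` (required by the XLM base tokenizer) is installed; the exact
    # subwords depend on that checkpoint's vocab.json/merges.txt.
    tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-base-cased")

    text = "Cześć, jak się masz?"
    # BasicTokenizer splits on whitespace/punctuation, then each pre-token is subtokenized
    # with the XLM-style BPE.
    tokens = tokenizer.tokenize(text)
    # encode() additionally wraps the token ids in the model's special tokens.
    ids = tokenizer.encode(text)
    print(tokens)
    print(ids)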