# coding=utf-8
# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Flaubert, based on XLM."""
import unicodedata

import six

from ...utils import logging
from ..xlm.tokenization_xlm import XLMTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "flaubert/flaubert_small_cased": "https://huggingface.co/flaubert/flaubert_small_cased/resolve/main/vocab.json",
        "flaubert/flaubert_base_uncased": "https://huggingface.co/flaubert/flaubert_base_uncased/resolve/main/vocab.json",
        "flaubert/flaubert_base_cased": "https://huggingface.co/flaubert/flaubert_base_cased/resolve/main/vocab.json",
        "flaubert/flaubert_large_cased": "https://huggingface.co/flaubert/flaubert_large_cased/resolve/main/vocab.json",
    },
    "merges_file": {
        "flaubert/flaubert_small_cased": "https://huggingface.co/flaubert/flaubert_small_cased/resolve/main/merges.txt",
        "flaubert/flaubert_base_uncased": "https://huggingface.co/flaubert/flaubert_base_uncased/resolve/main/merges.txt",
        "flaubert/flaubert_base_cased": "https://huggingface.co/flaubert/flaubert_base_cased/resolve/main/merges.txt",
        "flaubert/flaubert_large_cased": "https://huggingface.co/flaubert/flaubert_large_cased/resolve/main/merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "flaubert/flaubert_small_cased": 512,
    "flaubert/flaubert_base_uncased": 512,
    "flaubert/flaubert_base_cased": 512,
    "flaubert/flaubert_large_cased": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "flaubert/flaubert_small_cased": {"do_lowercase": False},
    "flaubert/flaubert_base_uncased": {"do_lowercase": True},
    "flaubert/flaubert_base_cased": {"do_lowercase": False},
    "flaubert/flaubert_large_cased": {"do_lowercase": False},
}

def convert_to_unicode(text):
"""
Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
"""
# six_ensure_text is copied from https://github.com/benjaminp/six
def six_ensure_text(s, encoding="utf-8", errors="strict"):
if isinstance(s, six.binary_type):
return s.decode(encoding, errors)
elif isinstance(s, six.text_type):
return s
else:
raise TypeError(f"not expecting type '{type(s)}'")
return six_ensure_text(text, encoding="utf-8", errors="ignore")
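
# A minimal illustration of `convert_to_unicode` (comments only, not executed;
# the byte literal below is "café" encoded as UTF-8):
#
#     convert_to_unicode(b"caf\xc3\xa9")  # -> "café"
#     convert_to_unicode("café")          # -> "café" (str input is returned unchanged)
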
class FlaubertTokenizer(XLMTokenizer):
"""
Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
- Moses preprocessing and tokenization.
- Normalizing all inputs text.
- The arguments ``special_tokens`` and the function ``set_special_tokens``, can be used to add additional symbols
(like "__classify__") to a vocabulary.
- The argument :obj:`do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
and documentation regarding arguments.
"""
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, do_lowercase=False, **kwargs):
        super().__init__(**kwargs)
        self.do_lowercase = do_lowercase
        # XLM's combined lowercase-and-strip-accents normalization is disabled for Flaubert
        self.do_lowercase_and_remove_accent = False

    def preprocess_text(self, text):
        # Normalize Penn-Treebank-style quotes to plain double quotes
        text = text.replace("``", '"').replace("''", '"')
        text = convert_to_unicode(text)
        text = unicodedata.normalize("NFC", text)

        if self.do_lowercase:
            text = text.lower()

        return text
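
    # Illustrative sketch of `preprocess_text` (`tok` is a hypothetical instance
    # created with do_lowercase=True; these calls are examples, not library API docs):
    #
    #     tok.preprocess_text("``Bonjour''")  # -> '"bonjour"'
    #     tok.preprocess_text("Cafe\u0301")   # -> "café" (NFC composes "e" + U+0301)
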
    def _tokenize(self, text, bypass_tokenizer=False):
        """
        Tokenize a string using Moses, given a language code.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            bypass_tokenizer (bool): Allow users to preprocess and tokenize the sentences externally
                (default = False). If :obj:`True`, only BPE is applied.

        Returns:
            List of tokens.
        """
        lang = "fr"  # Flaubert models are trained on French only
        if lang and self.lang2id and lang not in self.lang2id:
            logger.error(
                "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
            )
        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.preprocess_text(text)
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                # Apply BPE to each Moses token and collect the resulting sub-tokens
                split_tokens.extend(self.bpe(token).split(" "))

        return split_tokens
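

# Example usage (a minimal sketch; assumes the `transformers` package is installed
# and the "flaubert/flaubert_base_cased" checkpoint is reachable):
#
#     from transformers import FlaubertTokenizer
#
#     tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
#     tokens = tokenizer.tokenize("Bonjour, comment allez-vous ?")
#     ids = tokenizer.convert_tokens_to_ids(tokens)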