|
from enum import Enum |
|
from typing import List, Tuple, Union |
|
|
|
|
|
# A pair of ints.
# NOTE(review): presumably (start, end) positions -- confirm against the code
# that produces offsets.
Offsets = Tuple[int, int]

TextInputSequence = str
"""A :obj:`str` that represents an input sequence"""

# ``Tuple[str, ...]`` (variable length) is used on purpose: a bare
# ``Tuple[str]`` only describes a tuple holding exactly one string, which
# contradicts the "Tuple of str" contract documented below.
PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
"""A pre-tokenized input sequence. Can be one of:

    - A :obj:`List` of :obj:`str`
    - A :obj:`Tuple` of :obj:`str`
"""

TextEncodeInput = Union[
    TextInputSequence,
    Tuple[TextInputSequence, TextInputSequence],
    List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.TextInputSequence`
    - A pair of sequences:

      - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""

PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
    - A pair of sequences:

      - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""

InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~tokenizers.TextInputSequence`
    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedInputSequence`
"""

EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~tokenizers.TextEncodeInput`
    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedEncodeInput`
"""
|
|
|
|
|
class OffsetReferential(Enum):
    """Closed set of offset referentials, each backed by a lowercase string.

    NOTE(review): presumably selects whether offsets index into the original
    or the normalized text -- confirm against the code consuming this enum.
    """

    # Declaration order is the iteration order; values are the strings
    # accepted by ``OffsetReferential(<value>)`` lookups.
    ORIGINAL = "original"
    NORMALIZED = "normalized"
|
|
|
|
|
class OffsetType(Enum):
    """Closed set of offset unit kinds, each backed by a lowercase string.

    NOTE(review): presumably chooses between byte-based and character-based
    offsets -- confirm against the code consuming this enum.
    """

    # Declaration order is the iteration order; values are the strings
    # accepted by ``OffsetType(<value>)`` lookups.
    BYTE = "byte"
    CHAR = "char"
|
|
|
|
|
class SplitDelimiterBehavior(Enum):
    """Closed set of named split-delimiter behaviors.

    Every member's value is the lowercase, snake_case spelling of its name.

    NOTE(review): presumably tells a splitting routine what to do with the
    matched delimiter -- confirm against the code consuming this enum.
    """

    # Declaration order is the iteration order; keep it stable.
    REMOVED = "removed"
    ISOLATED = "isolated"
    MERGED_WITH_PREVIOUS = "merged_with_previous"
    MERGED_WITH_NEXT = "merged_with_next"
    CONTIGUOUS = "contiguous"
|
|
|
|
|
from .tokenizers import ( |
|
AddedToken, |
|
Encoding, |
|
NormalizedString, |
|
PreTokenizedString, |
|
Regex, |
|
Token, |
|
Tokenizer, |
|
decoders, |
|
models, |
|
normalizers, |
|
pre_tokenizers, |
|
processors, |
|
trainers, |
|
__version__, |
|
) |
|
from .implementations import ( |
|
BertWordPieceTokenizer, |
|
ByteLevelBPETokenizer, |
|
CharBPETokenizer, |
|
SentencePieceBPETokenizer, |
|
SentencePieceUnigramTokenizer, |
|
) |
|
|