from dataclasses import dataclass
from typing import List
import tiktoken

try:
    from transformers import AutoTokenizer
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    AutoTokenizer = None
    TRANSFORMERS_AVAILABLE = False


def get_tokenizer(tokenizer_name: str = "cl100k_base"):
    """
    Get a tokenizer instance by name.

    :param tokenizer_name: a tiktoken encoding name (e.g. "cl100k_base")
        or a Hugging Face model name
    :return: tokenizer instance
    """
    # Prefer tiktoken when the name matches one of its built-in encodings.
    if tokenizer_name in tiktoken.list_encoding_names():
        return tiktoken.get_encoding(tokenizer_name)
    # Otherwise treat the name as a Hugging Face model identifier.
    if TRANSFORMERS_AVAILABLE:
        try:
            return AutoTokenizer.from_pretrained(tokenizer_name)
        except Exception as e:
            raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}") from e
    raise ValueError(
        f"{tokenizer_name!r} is not a tiktoken encoding and Hugging Face "
        "Transformers is not installed; install transformers to load it."
    )


@dataclass
class Tokenizer:
    """Thin wrapper exposing encode/decode/chunk over a tokenizer selected by name."""

    model_name: str = "cl100k_base"

    def __post_init__(self):
        self.tokenizer = get_tokenizer(self.model_name)

    def encode_string(self, text: str) -> List[int]:
        """
        Encode text to tokens.

        :param text: text to encode
        :return: token ids
        """
        return self.tokenizer.encode(text)

    def decode_tokens(self, tokens: List[int]) -> str:
        """
        Decode tokens to text.

        :param tokens: token ids to decode
        :return: decoded text
        """
        return self.tokenizer.decode(tokens)

    def chunk_by_token_size(
        self,
        content: str,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
    ) -> List[dict]:
        """
        Split text into chunks of at most max_token_size tokens, with
        consecutive chunks overlapping by overlap_token_size tokens.

        :param content: text to chunk
        :param overlap_token_size: number of tokens shared by adjacent chunks
        :param max_token_size: maximum tokens per chunk
        :return: list of dicts with "tokens", "content", and "chunk_order_index"
        """
        if overlap_token_size >= max_token_size:
            raise ValueError("overlap_token_size must be smaller than max_token_size")
        tokens = self.encode_string(content)
        results = []
        # Step through the token list; each chunk starts overlap_token_size
        # tokens before the previous chunk ended.
        for index, start in enumerate(
            range(0, len(tokens), max_token_size - overlap_token_size)
        ):
            chunk_content = self.decode_tokens(tokens[start : start + max_token_size])
            results.append(
                {
                    "tokens": min(max_token_size, len(tokens) - start),
                    "content": chunk_content.strip(),
                    "chunk_order_index": index,
                }
            )
        return results
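

# A minimal usage sketch (illustrative, not part of the module's API): chunk a
# long string with the default cl100k_base encoding. Assumes tiktoken is
# installed; the encoding may be downloaded on first use.
if __name__ == "__main__":
    tokenizer = Tokenizer()
    sample = "word " * 3000  # roughly 3000 tokens under cl100k_base
    for chunk in tokenizer.chunk_by_token_size(
        sample, overlap_token_size=128, max_token_size=1024
    ):
        print(chunk["chunk_order_index"], chunk["tokens"])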