File size: 2,549 Bytes
a84a65c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import torch

class AbsTokenizer(torch.nn.Module):
    """
    Abstract base class shared by all tokenizers.

    Concrete tokenizers are expected to inherit from this class and
    override the members below. Typical conversions:
        Text  -> BPE
        Text  -> Phone
        Audio -> Codec
        Image -> Codec
        ...
    """

    @property
    def is_discrete(self):
        """
        Tell whether the tokenization result is discrete or continuous.

        True:  the results are discrete token-ids, e.g., BPE / Phone / Codec.
        False: the results are continuous embeddings, e.g., RoBERTa embeddings.
        """
        raise NotImplementedError()

    @property
    def codebook_length(self):
        """
        Total length of the codebook.

        Should be 0 when "self.is_discrete is False"; otherwise it is the
        length of the codebook. For example, an audio codec that adopts
        4 codebooks of size 1024 each reports 4 * 1024.
        The value is used to create the shared vocabulary for softmax.
        """
        raise NotImplementedError()

    def find_length(self, x):
        """
        Quickly report the length of the output for `x`, usually without
        running the tokenization itself.

        Used in the batchifying process to measure the whole length of an
        example. Typical units:
            number of BPE / Frames / Codec sequence / Embedding lengths
        """
        raise NotImplementedError()

    def tokenize(self, x):
        """ Tokenize a single example.

            `x` can typically be any input type, e.g.,
                text: a path to the audio
                text: the exact text data for BPE / G2P
                Tensor: already loaded data, e.g., audio

            Discrete tokenizers return a 1-D LONG tensor.
            Continuous tokenizers return a 2-D FLOAT tensor shaped
            [length, embedding_size].
        """
        raise NotImplementedError()

    def tokenize_batch(self, xs, lengths=None):
        """ Batched counterpart of 'tokenize'.

            Implementing this method is optional, since it is only used
            offline.

            Warning: verify that the results of 'tokenize_batch' and
            'tokenize' are actually (or roughly) identical, i.e., that
            padding does not affect the results.

            Returns a list of 'tokenize' results. Do NOT turn it into a
            batched Tensor.
        """
        raise NotImplementedError()

    def detokenize(self, x):
        """ Recover the original input from a 'tokenize' result.

            Implementing this method is optional, because some tokenization
            processes are not recoverable, e.g., hubert.
        """
        raise NotImplementedError()