Commit 2b999a0 · 1 Parent(s): 72b71cf

utils module
utils.py ADDED
@@ -0,0 +1,79 @@
"""
Various utilities needed for the presentation.
"""

from transformers import BatchEncoding, PreTrainedTokenizerBase
from typing import Callable, Dict, List


def tokenize(tokenizer: PreTrainedTokenizerBase,
             end_char: str = '\n') -> Callable[[Dict[str, List[str]]], BatchEncoding]:
    """
    Helper function that returns a function to use with the `map` method of
    `datasets.DatasetDict`. It takes a tokenizer and generates a function that
    applies that tokenizer, appending an optional `end_char` (e.g. a newline)
    to each example. This might be needed, for example, when tokenizing a
    poem whose structure requires the newline after each line, since the
    `datasets.load_dataset` function forcibly removes the newline characters.

    Parameters
    ----------
    tokenizer : PreTrainedTokenizerBase
        The tokenizer to use for the tokenization process.
    end_char : str
        The end character to append to each line.

    Returns
    -------
    Callable[[Dict[str, List[str]]], BatchEncoding]
        The function in charge of the tokenization process.
    """
    def _tokenize(examples: Dict[str, List[str]]) -> BatchEncoding:
        # Append `end_char` to each example before tokenizing the batch.
        return tokenizer([f'{e}{end_char}' for e in examples['text']])

    return _tokenize
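
# For context, a minimal usage sketch of `tokenize` (not part of this
# commit; the 'gpt2' checkpoint and 'poems.txt' data file are hypothetical
# placeholders):
#
#     from datasets import load_dataset
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained('gpt2')
#     # The 'text' loader yields one example per line with the newline
#     # stripped, which is why `tokenize` re-appends `end_char`.
#     dataset = load_dataset('text', data_files={'train': 'poems.txt'})
#     tokenized = dataset.map(tokenize(tokenizer), batched=True,
#                             remove_columns=['text'])
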
def group_texts(examples: Dict[str, List[List[int]]],
                block_size: int = 128) -> Dict[str, List[List[int]]]:
    """
    Helper function that concatenates a dataset tokenized with the function
    above and splits it into chunks of `block_size`. The code was taken from
    https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb

    Parameters
    ----------
    examples : Dict[str, List[List[int]]]
        This is actually a `LazyBatch` from the datasets library, which is
        given by the `DatasetDict.map` method. It should be the dataset
        returned after tokenization with the function returned by `tokenize`,
        and it should have two main keys: 'input_ids' and 'attention_mask'.
    block_size : int
        The size of the blocks to use in the training process. If the total
        length of the group of texts is not divisible by the block size, the
        remaining data is ignored for simplicity.

    Returns
    -------
    Dict[str, List[List[int]]]
        The dictionary that will provide the new dataset divided in chunks
        of `block_size`.
    """
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples['input_ids'])
    # Drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of `block_size` length.
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels for the training phase: a plain copy of the input ids is
    # enough, since the Transformers library takes care of shifting them to
    # the right.
    result["labels"] = result["input_ids"].copy()
    return result
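
For context, a minimal continuation of the hypothetical sketch above: `group_texts` is meant to be applied with a second batched `map` call. With the default `block_size=128`, 1000 concatenated tokens would yield 7 full blocks (1000 // 128 = 7) and the trailing 104 tokens would be dropped:

    # Chunk the tokenized dataset into fixed-size blocks for causal LM
    # training; `fn_kwargs` can override the default `block_size`.
    lm_dataset = tokenized.map(group_texts, batched=True,
                               fn_kwargs={'block_size': 128})

    # Every example now holds exactly `block_size` token ids, plus a
    # 'labels' copy of 'input_ids' for the loss computation.
    assert len(lm_dataset['train'][0]['input_ids']) == 128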