Commit 2b999a0 · 1 Parent(s): 72b71cf

utils module
utils.py ADDED
@@ -0,0 +1,79 @@
"""
Various utilities needed for the presentation.
"""

from transformers import BatchEncoding, PreTrainedTokenizerBase
from typing import Callable, Dict, List


def tokenize(tokenizer: PreTrainedTokenizerBase,
             end_char: str = '\n') -> Callable[[Dict[str, List[str]]], BatchEncoding]:
    """
    Helper function that returns a function to use with the `map` method of
    `datasets.DatasetDict`. It takes a tokenizer and generates a function that
    applies that tokenizer, appending an optional `end_char` (e.g. a newline)
    to each example. This might be needed, for example, when tokenizing a
    poem whose structure requires the newline after each line, since the
    `datasets.load_dataset` function forcibly removes the newline characters.

    Parameters
    ----------
    tokenizer : PreTrainedTokenizerBase
        The tokenizer to use for the tokenization process.
    end_char : str
        The end character to append to each line.

    Returns
    -------
    Callable[[Dict[str, List[str]]], BatchEncoding]
        The function in charge of the tokenization process.
    """
    def _tokenize(examples: Dict[str, List[str]]) -> BatchEncoding:
        # Append `end_char` to each example before tokenizing the batch.
        return tokenizer([f'{e}{end_char}' for e in examples['text']])

    return _tokenize
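
# For context, a minimal usage sketch of `tokenize` (not part of this
# commit; the 'gpt2' checkpoint and 'poems.txt' data file are hypothetical
# placeholders):
#
#     from datasets import load_dataset
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained('gpt2')
#     # The 'text' loader yields one example per line with the newline
#     # stripped, which is why `tokenize` re-appends `end_char`.
#     dataset = load_dataset('text', data_files={'train': 'poems.txt'})
#     tokenized = dataset.map(tokenize(tokenizer), batched=True,
#                             remove_columns=['text'])
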
def group_texts(examples: Dict[str, List[List[int]]],
                block_size: int = 128) -> Dict[str, List[List[int]]]:
    """
    Helper function that concatenates a dataset tokenized with the function
    above and splits it into chunks of `block_size`. The code was taken from
    https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb

    Parameters
    ----------
    examples : Dict[str, List[List[int]]]
        This is actually a `LazyBatch` from the datasets library, which is
        given by the `DatasetDict.map` method. It should be the dataset
        returned after tokenization with the function returned by `tokenize`,
        and it should have two main keys: 'input_ids' and 'attention_mask'.
    block_size : int
        The size of the blocks to use in the training process. If the total
        length of the group of texts is not divisible by the block size, the
        remaining data is ignored for simplicity.

    Returns
    -------
    Dict[str, List[List[int]]]
        The dictionary that will provide the new dataset divided in chunks
        of `block_size`.
    """
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples['input_ids'])
    # Drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of `block_size` length.
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels for the training phase: a plain copy of the input ids is
    # enough, since the Transformers library takes care of shifting them to
    # the right.
    result["labels"] = result["input_ids"].copy()
    return result
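
For context, a minimal continuation of the hypothetical sketch above: `group_texts` is meant to be applied with a second batched `map` call. With the default `block_size=128`, 1000 concatenated tokens would yield 7 full blocks (1000 // 128 = 7) and the trailing 104 tokens would be dropped:

    # Chunk the tokenized dataset into fixed-size blocks for causal LM
    # training; `fn_kwargs` can override the default `block_size`.
    lm_dataset = tokenized.map(group_texts, batched=True,
                               fn_kwargs={'block_size': 128})

    # Every example now holds exactly `block_size` token ids, plus a
    # 'labels' copy of 'input_ids' for the loss computation.
    assert len(lm_dataset['train'][0]['input_ids']) == 128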