File size: 10,533 Bytes

cccb5a0

from datasets import load_dataset
from sympy import Line2D
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate
import numpy as np
import torch
import math

# Prefer the GPU when PyTorch can see a CUDA device; otherwise run on the CPU.
# The chosen device is echoed once at import time so the run log records it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

class MaskedLM:
    """Helper that walks through fine-tuning a masked language model.

    The methods are written to be called in sequence: ``load_dataset`` ->
    ``load_support`` -> ``map_tokenize_dataset`` -> ``map_chunk_dataset`` ->
    ``dataset_split`` -> ``create_model`` -> ``create_argumentTrainer`` ->
    ``call_train`` -> ``call_pipeline``. Each step stores its result on the
    instance so the next step can pick it up.
    """

    def __init__(self):
        self.model = None               # model created by create_model()
        self.metric = None              # reserved; not used by the methods below
        self.tokenizer = None           # set by load_support()
        self.data_collator = None       # set by load_support()
        self.raw_data = None            # set by load_dataset()
        self.model_checkpoint = None    # set by load_support()
        self.tokenized_dataset = None   # set by map_tokenize_dataset()
        self.chunk_size = 128           # number of tokens per training chunk
        self.chunks_dataset = None      # set by map_chunk_dataset()
        self.split_dataset = None       # set by dataset_split()
        self.args = None                # set by create_argumentTrainer()
        self.saved_model_path = None    # set by call_train(save_local=True)

    def load_dataset(self, name="imdb"):
        """Load dataset ``name`` into ``self.raw_data`` and print a summary."""
        self.raw_data = load_dataset(name)
        print("Name of dataset: ", name)
        print(self.raw_data)

    def load_support(self, mlm_probability=0.15,
                     model_checkpoint="distilbert-base-uncased"):
        """Create the tokenizer and the masking data collator.

        Args:
            mlm_probability: value forwarded to
                ``DataCollatorForLanguageModeling(mlm_probability=...)``.
            model_checkpoint: checkpoint name used for the tokenizer here and
                for the model in ``explore_infoModel`` / ``create_model``.
        """
        self.model_checkpoint = model_checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm_probability=mlm_probability
        )
        print("Name of model checkpoint: " + self.model_checkpoint)
        print("Tokenizer Fast: ", self.tokenizer.is_fast)
        print("Symbol of masked word after tokenizer: ", self.tokenizer.mask_token)
        print("Model max length tokenizer: ", self.tokenizer.model_max_length)

    def explore_infoModel(self, k=5):
        """Print model size and the top-``k`` fillers for a sample sentence.

        Requires ``load_support`` to have been called first (uses
        ``self.tokenizer`` and ``self.model_checkpoint``).
        """
        model = AutoModelForMaskedLM.from_pretrained(self.model_checkpoint)
        model_parameters = model.num_parameters() / 1000000
        print(f">>> Number of parameters of {self.model_checkpoint}: {round(model_parameters)}M")

        example = "This is a great [MASK]."
        print("\n")
        print(">>> Example: ", example)
        inputs = self.tokenizer(example, return_tensors='pt')
        # Pure inspection: no gradients are needed for this forward pass.
        with torch.no_grad():
            token_logits = model(**inputs).logits
        print(f"{'Number of tokens: ':<30}{len(inputs.tokens())}")
        print(f"{'Tokens prepare for training: ':<30}{inputs.tokens()}")
        print(f"{'IDs of tokens: ':<30}{inputs.input_ids}")
        print(f"{'Maked IDs of Model: ':<30}{self.tokenizer.mask_token_id}")
        print(f"{'Logits of example: ':<30}{token_logits}")
        print(f"{'Shape of logits: ':<30}{token_logits.size()}")

        # Locate the [MASK] position(s) along the sequence axis.
        mask_token_index = torch.where(inputs.input_ids == self.tokenizer.mask_token_id)[1]
        print(f"{'Position of masked token index: ':<30}{mask_token_index}")

        # Vocabulary logits at the masked position(s).
        mask_token_logits = token_logits[0, mask_token_index, :]
        print(f"{'Logit of tokens in Vocab for [MASK]: ':<30}{mask_token_logits}")

        # One top-k call yields both the scores and the vocabulary indices.
        top_k = torch.topk(mask_token_logits, k, dim=1)
        top_k_values = top_k.values[0].tolist()
        print(f"{'Top value of suitable token in Vocab: ':<30}{top_k_values}")
        top_k_tokens = top_k.indices[0].tolist()
        print(f"{'Position of suitable token in Vocab: ':<30}{top_k_tokens}")

        # Show the sentence with each candidate substituted for the mask.
        for token in top_k_tokens:
            print(">>> ", example.replace(self.tokenizer.mask_token, self.tokenizer.decode([token])))

    def get_feature_items(self, set="train", index=0, feature="text"):
        """Return ``raw_data[set][index][feature]``, or ``None`` if it is None/0.

        NOTE: a value equal to ``0`` (for example a class label of 0) is
        reported as ``None``. This is kept as-is because ``get_pair_items``
        and the script at the bottom of the file rely on it.
        """
        # Look the row up once; the original repeated the same lookup 3 times.
        value = self.raw_data[set][index][feature]
        if value is None or value == 0:
            return None
        return value

    def get_pair_items(self, set="train", index=0, feature1="text", feature2="label"):
        """Return a pair of strings built from two features of one example.

        When ``feature2`` is an iterable, both features are zipped (stopping
        at the shorter one) and each side is concatenated into a string.
        When ``feature2`` is missing, zero, or a non-iterable scalar, the
        first feature is returned for both sides.
        """
        first = self.get_feature_items(set, index, feature1)
        second = self.get_feature_items(set, index, feature2)
        if second is None:
            return first, first
        try:
            iter(second)
        except TypeError:
            # A scalar label cannot be zipped; treat it like the
            # missing/zero case instead of crashing inside zip().
            return first, first

        pairs = list(zip(first, second))
        line1 = "".join(str(word) for word, _ in pairs)
        line2 = "".join(str(label) for _, label in pairs)
        return line1, line2

    def get_tokenizer(self, set="train", index=0, feature="text"):
        """Tokenize one example and return ``(tokens, word_ids)``."""
        inputs = self.tokenizer(self.get_feature_items(set, index, feature))
        return inputs.tokens(), inputs.word_ids()

    def tokenizer_dataset(self, example):
        """Tokenize a batch of ``example["text"]`` and attach ``word_ids``."""
        inputs = self.tokenizer(example["text"])
        inputs["word_ids"] = [inputs.word_ids(i) for i in range(len(inputs["input_ids"]))]
        return inputs

    def map_tokenize_dataset(self):
        """Tokenize ``self.raw_data`` and drop the ``text``/``label`` columns."""
        print("Start of processing dataset")
        self.tokenized_dataset = self.raw_data.map(
            self.tokenizer_dataset, batched=True, remove_columns=["text", "label"]
        )
        print("Done mapping")
        print("Tokenized dataset: ", self.tokenized_dataset)

    def group_text_chunk(self, example):
        """Concatenate a batch and re-split it into ``chunk_size`` pieces.

        Every column of ``example`` (a mapping of column name to a list of
        sequences) is flattened, the tail that does not fill a whole chunk is
        dropped, and a ``labels`` column is added as a copy of ``input_ids``.
        """
        # Flatten each column in linear time; sum(lists, []) re-copies the
        # accumulated list on every addition and is quadratic.
        concatenated = {
            key: [item for sequence in example[key] for item in sequence]
            for key in example.keys()
        }

        # Keep only as many tokens as fit into whole chunks.
        total_length = len(concatenated["input_ids"])
        total_length = (total_length // self.chunk_size) * self.chunk_size

        chunks = {
            key: [
                values[start:start + self.chunk_size]
                for start in range(0, total_length, self.chunk_size)
            ]
            for key, values in concatenated.items()
        }

        # The collator produces the masked inputs; labels start as a copy.
        chunks["labels"] = chunks["input_ids"].copy()
        return chunks

    def map_chunk_dataset(self):
        """Apply ``group_text_chunk`` to the tokenized dataset."""
        print("Start of processing dataset")
        self.chunks_dataset = self.tokenized_dataset.map(self.group_text_chunk, batched=True)
        print("Done mapping")
        print("Chunked dataset: ", self.chunks_dataset)

    def dataset_split(self, test_size=0.2, seed=42):
        """Split the chunked ``train`` split into train/test subsets."""
        self.split_dataset = self.chunks_dataset["train"].train_test_split(
            test_size=test_size, seed=seed
        )
        print("Preparing dataset: ", self.split_dataset)

    def create_model(self):
        """Instantiate the masked-LM model from ``self.model_checkpoint``."""
        print("Start creating model")
        self.model = AutoModelForMaskedLM.from_pretrained(self.model_checkpoint)
        print(self.model)

    def create_argumentTrainer(self, output_dir="fine_tuned_", eval_strategy="epoch", logging_strategy="epoch",
                               learning_rate=2e-5, num_train_epochs=20, weight_decay=0.01, batch_size=64,
                               save_strategy="epoch", push_to_hub=False, hub_model_id="", fp16=True):
        """Build, store and return the ``TrainingArguments``.

        ``output_dir`` is used as a prefix in front of the checkpoint name.
        ``logging_strategy`` is forwarded to ``TrainingArguments`` (it used
        to be accepted and silently ignored). ``fp16`` is only honoured when
        CUDA is available, and an empty ``hub_model_id`` is passed as ``None``.
        """
        # Roughly one log per epoch when a step-based strategy is chosen;
        # never 0, which is not a valid step interval.
        logging_steps = max(1, len(self.split_dataset["train"]) // batch_size)

        use_fp16 = bool(fp16) and torch.cuda.is_available()
        if fp16 and not use_fp16:
            print("fp16 requested but CUDA is not available; training in fp32")

        self.args = TrainingArguments(
            output_dir=f"{output_dir}{self.model_checkpoint}",
            overwrite_output_dir=True,
            eval_strategy=eval_strategy,
            save_strategy=save_strategy,
            logging_strategy=logging_strategy,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            push_to_hub=push_to_hub,
            hub_model_id=hub_model_id or None,
            fp16=use_fp16,
            logging_steps=logging_steps,
        )
        print("Arguments ready for training")
        return self.args

    @staticmethod
    def _perplexity(loss):
        """Return ``exp(loss)``, or ``inf`` when the exponent overflows."""
        try:
            return math.exp(loss)
        except OverflowError:
            return float("inf")

    def call_train(self, model_path="pretrained_model_", set_train="train", set_val="test", push_to_hub=False, save_local=False):
        """Evaluate, train, evaluate again, then optionally save and push.

        Args:
            model_path: prefix for the local save directory; the checkpoint
                name is appended to it.
            set_train / set_val: keys of ``self.split_dataset`` to use.
            push_to_hub: call ``trainer.push_to_hub`` after training.
            save_local: save the model locally and remember the directory in
                ``self.saved_model_path``.
        """
        trainer = Trainer(
            model=self.model,
            args=self.args,
            train_dataset=self.split_dataset[set_train],
            eval_dataset=self.split_dataset[set_val],
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
        )
        eval_result1 = trainer.evaluate()
        print("Perplexity before of training: ", self._perplexity(eval_result1['eval_loss']))

        print("Start training")
        trainer.train()
        print("Done training")

        eval_result2 = trainer.evaluate()
        print("Perplexity after of training: ", self._perplexity(eval_result2['eval_loss']))

        if save_local:
            save_dir = model_path + self.model_checkpoint
            trainer.save_model(save_dir)
            # Remembered so call_pipeline(local=True) can find the model.
            self.saved_model_path = save_dir
            print("Done saving to local")

        if push_to_hub:
            trainer.push_to_hub(commit_message="Training complete")
            print("Done pushing push to hub")

    def _resolve_pipeline_model(self, local, path):
        """Pick the model location that ``call_pipeline`` should load."""
        if path:
            return path
        if local and self.saved_model_path:
            return self.saved_model_path
        raise ValueError(
            "No model to load: pass `path`, or use local=True after "
            "call_train(save_local=True)."
        )

    def call_pipeline(self, local=False, path="", example=""):
        """Run a fill-mask pipeline on ``example`` and print the predictions.

        Args:
            local: when true and ``path`` is empty, load the directory saved
                by ``call_train(save_local=True)``.
            path: model directory or model id; takes precedence when given.
            example: text passed to the pipeline.

        Returns:
            Whatever the pipeline call returns (the original returned None).

        Raises:
            ValueError: when no model location can be determined.
        """
        # The original passed model="" whenever local=True, which can never
        # load anything.
        model_checkpoint = self._resolve_pipeline_model(local, path)
        mask_filler = pipeline(
            "fill-mask",
            model=model_checkpoint,
        )
        predictions = mask_filler(example)
        print(predictions)
        return predictions

if __name__ == "__main__":
    # Horizontal rule printed on both sides of each section banner.
    rule = "-" * 50

    # 1. Load the dataset, then the tokenizer and data collator.
    mlm = MaskedLM()
    mlm.load_dataset()
    print(rule, "Exploring information of Supporting", rule)
    mlm.load_support()
    print(rule, "Exploring information of Supporting", rule)

    # 2. Explore the model and a couple of raw examples.
    print(rule, "Exploring some information of Model", rule)
    mlm.explore_infoModel()
    print(rule, "Exploring some information of Model", rule)
    first_text = mlm.get_feature_items(set="train", index=0, feature="text")
    print("Example[0] (text) in dataset: ", first_text[:100] + "...")
    first_label = mlm.get_feature_items(set="train", index=0, feature="label")
    print("Example[0] (label) in dataset: ", first_label)
    pair_input, pair_output = mlm.get_pair_items(
        set="train", index=1, feature1="text", feature2="label"
    )
    print("--> Inp of Example[1]: ", pair_input[:20] + "...")
    print("--> Out of Example[1]: ", pair_output[:20] + "...")

    # 3. Pre-process: tokenize, chunk, and split the dataset.
    tokens, word_ids = mlm.get_tokenizer(set="train", index=0, feature="text")
    print("Tokens List of Example 0: ", tokens)
    print("Word IDs List of Example 0: ", word_ids)
    mlm.map_tokenize_dataset()
    mlm.map_chunk_dataset()
    mlm.dataset_split()

    # 4. Instantiate the model.
    print(rule, f"Information of {mlm.model_checkpoint}", rule)
    mlm.create_model()
    print(rule, f"Information of {mlm.model_checkpoint}", rule)

    # 5. Choose hyperparameters, then train, save locally and push to the Hub.
    hub_repo = f"Chessmen/fine_tune_{mlm.model_checkpoint}"
    mlm.create_argumentTrainer(push_to_hub=True, hub_model_id=hub_repo)
    mlm.call_train(save_local=True, push_to_hub=True)

    # 6. Try the fine-tuned model through a fill-mask pipeline.
    mlm.call_pipeline(
        path="Chessmen/fine_tune_distilbert-base-uncased",
        example="This is a great [MASK].",
    )