botcon committed
Commit 4a8a37c
1 Parent(s): 58958df

Delete LukeQuestionAnswering.py

Files changed (1)
  1. LukeQuestionAnswering.py +0 -431
LukeQuestionAnswering.py DELETED
@@ -1,431 +0,0 @@
- from transformers import LukePreTrainedModel, LukeModel, AutoTokenizer, TrainingArguments, default_data_collator, Trainer, LukeForQuestionAnswering
- from transformers.modeling_outputs import ModelOutput
- from typing import Optional, Tuple, Union
-
- import numpy as np
- from tqdm import tqdm
- import evaluate
- import torch
- from dataclasses import dataclass
- from datasets import load_dataset
- from torch import nn
- from torch.nn import CrossEntropyLoss
- import collections
-
- PEFT = False
- repo_name = "LUKE_squad_finetuned_qa"
- tf32 = True
- fp16 = True
- train = False
- test = True
- trained_model = "LUKE_squad_finetuned_qa_tf32"
-
- torch.backends.cuda.matmul.allow_tf32 = tf32
- torch.backends.cudnn.allow_tf32 = tf32
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-
- if tf32:
-     repo_name += "_tf32"
-
- # https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/luke/modeling_luke.py#L319-L353
- # Taken from the HF repository so it is easier to add extra features -- currently identical to HF's LukeForQuestionAnswering
-
- @dataclass
- class LukeQuestionAnsweringModelOutput(ModelOutput):
-     """
-     Outputs of question answering models.
-
-     Args:
-         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-             Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-         start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
-             Span-start scores (before SoftMax).
-         end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
-             Span-end scores (before SoftMax).
-         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-             Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-         entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-             Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-             shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
-             layer plus the initial entity embedding outputs.
-         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-             sequence_length)`.
-
-             Attention weights after the attention softmax, used to compute the weighted average in the self-attention
-             heads.
-     """
-
-     loss: Optional[torch.FloatTensor] = None
-     start_logits: torch.FloatTensor = None
-     end_logits: torch.FloatTensor = None
-     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-     entity_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-     attentions: Optional[Tuple[torch.FloatTensor]] = None
-
- class AugmentedLukeForQuestionAnswering(LukePreTrainedModel):
-     def __init__(self, config):
-         super().__init__(config)
-
-         # num_labels is 2: one output each for the start and the end of the answer span.
-         self.num_labels = config.num_labels
-
-         self.luke = LukeModel(config, add_pooling_layer=False)
-
-         '''
-         Any improvements to the model are expected here. Additional features, anything...
-         '''
-         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
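# A hypothetical illustration of such an "improvement" (not part of the original
# file): the single linear span head could be swapped for a slightly deeper one,
# e.g.
#
#     self.qa_outputs = nn.Sequential(
#         nn.Linear(config.hidden_size, config.hidden_size),
#         nn.GELU(),
#         nn.Dropout(config.hidden_dropout_prob),
#         nn.Linear(config.hidden_size, config.num_labels),
#     )
#
# forward() below would work unchanged, since the head still maps hidden_size
# to num_labels per token.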
86
-
87
-
88
- # Initialize weights and apply final processing
89
- self.post_init()
90
-
91
- def forward(
92
- self,
93
- input_ids: Optional[torch.LongTensor] = None,
94
- attention_mask: Optional[torch.FloatTensor] = None,
95
- token_type_ids: Optional[torch.LongTensor] = None,
96
- position_ids: Optional[torch.FloatTensor] = None,
97
- entity_ids: Optional[torch.LongTensor] = None,
98
- entity_attention_mask: Optional[torch.FloatTensor] = None,
99
- entity_token_type_ids: Optional[torch.LongTensor] = None,
100
- entity_position_ids: Optional[torch.LongTensor] = None,
101
- head_mask: Optional[torch.FloatTensor] = None,
102
- inputs_embeds: Optional[torch.FloatTensor] = None,
103
- start_positions: Optional[torch.LongTensor] = None,
104
- end_positions: Optional[torch.LongTensor] = None,
105
- output_attentions: Optional[bool] = None,
106
- output_hidden_states: Optional[bool] = None,
107
- return_dict: Optional[bool] = None,
108
- ) -> Union[Tuple, LukeQuestionAnsweringModelOutput]:
109
-
110
- r"""
111
- start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
112
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
113
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
114
- are not taken into account for computing the loss.
115
- end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
116
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
117
- Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
118
- are not taken into account for computing the loss.
119
- """
120
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
121
-
122
-
123
- outputs = self.luke(
124
- input_ids=input_ids,
125
- attention_mask=attention_mask,
126
- token_type_ids=token_type_ids,
127
- position_ids=position_ids,
128
- entity_ids=entity_ids,
129
- entity_attention_mask=entity_attention_mask,
130
- entity_token_type_ids=entity_token_type_ids,
131
- entity_position_ids=entity_position_ids,
132
- head_mask=head_mask,
133
- inputs_embeds=inputs_embeds,
134
- output_attentions=output_attentions,
135
- output_hidden_states=output_hidden_states,
136
- return_dict=True,
137
- )
138
-
139
-
140
- sequence_output = outputs.last_hidden_state
141
-
142
-
143
- logits = self.qa_outputs(sequence_output)
144
- start_logits, end_logits = logits.split(1, dim=-1)
145
- start_logits = start_logits.squeeze(-1)
146
- end_logits = end_logits.squeeze(-1)
147
-
148
-
149
- total_loss = None
150
- if start_positions is not None and end_positions is not None:
151
- # If we are on multi-GPU, split add a dimension
152
- if len(start_positions.size()) > 1:
153
- start_positions = start_positions.squeeze(-1)
154
- if len(end_positions.size()) > 1:
155
- end_positions = end_positions.squeeze(-1)
156
- # sometimes the start/end positions are outside our model inputs, we ignore these terms
157
- ignored_index = start_logits.size(1)
158
- start_positions.clamp_(0, ignored_index)
159
- end_positions.clamp_(0, ignored_index)
160
-
161
- loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
162
- start_loss = loss_fct(start_logits, start_positions)
163
- end_loss = loss_fct(end_logits, end_positions)
164
- total_loss = (start_loss + end_loss) / 2
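# Worked example of the clamping above (illustrative numbers, not from the
# original file): with sequence_length = 384, ignored_index = 384; a label of
# 500 (outside the model inputs) is clamped to 384, which CrossEntropyLoss then
# skips because ignore_index=384, so impossible targets simply contribute no loss.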
-
-         if not return_dict:
-             return tuple(
-                 v
-                 for v in [
-                     total_loss,
-                     start_logits,
-                     end_logits,
-                     outputs.hidden_states,
-                     outputs.entity_hidden_states,
-                     outputs.attentions,
-                 ]
-                 if v is not None
-             )
-
-         return LukeQuestionAnsweringModelOutput(
-             loss=total_loss,
-             start_logits=start_logits,
-             end_logits=end_logits,
-             hidden_states=outputs.hidden_states,
-             entity_hidden_states=outputs.entity_hidden_states,
-             attentions=outputs.attentions,
-         )
-
- if __name__ == "__main__":
-     # Setting up the tokenizer and helper functions
-     # Work-around to get a fast tokenizer - RoBERTa and LUKE share the same subword vocab, and we are not using the entity functions of the LUKE tokenizer anyway
-     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
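# A quick, hypothetical sanity check of the claim above (not part of the
# original script): the slow LUKE tokenizer should produce the same input_ids
# as the RoBERTa fast tokenizer for plain text, e.g.
#
#     from transformers import LukeTokenizer
#     luke_tok = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
#     text = "Beyonce grew up in Houston, Texas."
#     assert tokenizer(text)["input_ids"] == luke_tok(text)["input_ids"]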
-
-     # Necessary initialization
-     max_length = 384
-     stride = 128
-     batch_size = 8
-     n_best = 20
-     max_answer_length = 30
-     metric = evaluate.load("squad")
-     raw_datasets = load_dataset("squad")
-
-     def compute_metrics(start_logits, end_logits, features, examples):
-         example_to_features = collections.defaultdict(list)
-         for idx, feature in enumerate(features):
-             example_to_features[feature["example_id"]].append(idx)
-
-         predicted_answers = []
-         for example in tqdm(examples):
-             example_id = example["id"]
-             context = example["context"]
-             answers = []
-
-             # Loop through all features associated with that example
-             for feature_index in example_to_features[example_id]:
-                 start_logit = start_logits[feature_index]
-                 end_logit = end_logits[feature_index]
-                 offsets = features[feature_index]["offset_mapping"]
-
-                 start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
-                 end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
-                 for start_index in start_indexes:
-                     for end_index in end_indexes:
-                         # Skip answers that are not fully in the context
-                         if offsets[start_index] is None or offsets[end_index] is None:
-                             continue
-                         # Skip answers with a length that is either < 0 or > max_answer_length
-                         if (
-                             end_index < start_index
-                             or end_index - start_index + 1 > max_answer_length
-                         ):
-                             continue
-
-                         answer = {
-                             "text": context[offsets[start_index][0] : offsets[end_index][1]],
-                             "logit_score": start_logit[start_index] + end_logit[end_index],
-                         }
-                         answers.append(answer)
-
-             # Select the answer with the best score
-             if len(answers) > 0:
-                 best_answer = max(answers, key=lambda x: x["logit_score"])
-                 predicted_answers.append(
-                     {"id": example_id, "prediction_text": best_answer["text"]}
-                 )
-             else:
-                 predicted_answers.append({"id": example_id, "prediction_text": ""})
-
-         theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
-         return metric.compute(predictions=predicted_answers, references=theoretical_answers)
-
-     def preprocess_training_examples(examples):
-         questions = [q.strip() for q in examples["question"]]
-         inputs = tokenizer(
-             questions,
-             examples["context"],
-             max_length=max_length,
-             truncation="only_second",
-             stride=stride,
-             return_overflowing_tokens=True,
-             return_offsets_mapping=True,
-             padding="max_length",
-         )
-
-         offset_mapping = inputs.pop("offset_mapping")
-         sample_map = inputs.pop("overflow_to_sample_mapping")
-         answers = examples["answers"]
-         start_positions = []
-         end_positions = []
-
-         for i, offset in enumerate(offset_mapping):
-             sample_idx = sample_map[i]
-             answer = answers[sample_idx]
-             start_char = answer["answer_start"][0]
-             end_char = answer["answer_start"][0] + len(answer["text"][0])
-             sequence_ids = inputs.sequence_ids(i)
-
-             # Find the start and end of the context
-             idx = 0
-             while sequence_ids[idx] != 1:
-                 idx += 1
-             context_start = idx
-             while sequence_ids[idx] == 1:
-                 idx += 1
-             context_end = idx - 1
-
-             # If the answer is not fully inside the context, label is (0, 0)
-             if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
-                 start_positions.append(0)
-                 end_positions.append(0)
-             else:
-                 # Otherwise it's the start and end token positions
-                 idx = context_start
-                 while idx <= context_end and offset[idx][0] <= start_char:
-                     idx += 1
-                 start_positions.append(idx - 1)
-
-                 idx = context_end
-                 while idx >= context_start and offset[idx][1] >= end_char:
-                     idx -= 1
-                 end_positions.append(idx + 1)
-
-         inputs["start_positions"] = start_positions
-         inputs["end_positions"] = end_positions
-         return inputs
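# Toy illustration of the labelling loop above (hypothetical offsets, not from
# the original file): for a context "John lives in Berlin" with answer "Berlin"
# (start_char=14, end_char=20) and context-token offsets
# [(0, 4), (5, 10), (11, 13), (14, 20)], the two while loops both land on the
# "Berlin" token, so start_position == end_position == its index; an answer
# truncated out of a feature gets the (0, 0) label instead.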
-
-     def preprocess_validation_examples(examples):
-         questions = [q.strip() for q in examples["question"]]
-         inputs = tokenizer(
-             questions,
-             examples["context"],
-             max_length=max_length,
-             truncation="only_second",
-             stride=stride,
-             return_overflowing_tokens=True,
-             return_offsets_mapping=True,
-             padding="max_length",
-         )
-
-         sample_map = inputs.pop("overflow_to_sample_mapping")
-         example_ids = []
-
-         for i in range(len(inputs["input_ids"])):
-             sample_idx = sample_map[i]
-             example_ids.append(examples["id"][sample_idx])
-
-             sequence_ids = inputs.sequence_ids(i)
-             offset = inputs["offset_mapping"][i]
-             inputs["offset_mapping"][i] = [
-                 o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
-             ]
-
-         inputs["example_id"] = example_ids
-         return inputs
-
-     if train:
-         base_luke = "studio-ousia/luke-base"
-
-         # tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-         model = AugmentedLukeForQuestionAnswering.from_pretrained(base_luke).to(device)
-
-         train_dataset = raw_datasets["train"].map(
-             preprocess_training_examples,
-             batched=True,
-             remove_columns=raw_datasets["train"].column_names,
-         )
-
-         validation_dataset = raw_datasets["validation"].map(
-             preprocess_validation_examples,
-             batched=True,
-             remove_columns=raw_datasets["validation"].column_names,
-         )
-
-         # --------------- PEFT -------------------- # One epoch without PEFT took about 2h on my computer with CUDA; PEFT results were noticeably worse, though
-         if PEFT:
-             from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
-
-             # ---- For all linear layers ----
-             import re
-             pattern = r'\((\w+)\): Linear'
-             linear_layers = re.findall(pattern, str(model.modules))
-             target_modules = list(set(linear_layers))
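# Hypothetical alternative to the regex above (not what the original script
# does): collect the LoRA target modules by walking the module tree directly,
# e.g.
#
#     target_modules = sorted({
#         name.split(".")[-1]
#         for name, module in model.named_modules()
#         if isinstance(module, nn.Linear)
#     })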
-
-             # If using PEFT, consider increasing r for better performance
-             peft_config = LoraConfig(
-                 task_type=TaskType.QUESTION_ANS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, target_modules=target_modules, bias='all'
-             )
-
-             model = get_peft_model(model, peft_config)
-             model.print_trainable_parameters()
-
-             repo_name += "_PEFT"
-
-         # ------------------------------------------ #
-
-         args = TrainingArguments(
-             repo_name,
-             evaluation_strategy="no",
-             save_strategy="epoch",
-             learning_rate=2e-5,
-             per_device_train_batch_size=batch_size,
-             per_device_eval_batch_size=batch_size,
-             num_train_epochs=3,
-             weight_decay=0.01,
-             push_to_hub=True,
-             fp16=fp16
-         )
-
-         trainer = Trainer(
-             model,
-             args,
-             train_dataset=train_dataset,
-             eval_dataset=validation_dataset,
-             data_collator=default_data_collator,
-             tokenizer=tokenizer
-         )
-
-         trainer.train()
-
-     elif test:
-         model = AugmentedLukeForQuestionAnswering.from_pretrained(trained_model).to(device)
-
-         interval = len(raw_datasets["validation"]) // 100
-         exact_match = 0
-         f1 = 0
-
-         with torch.no_grad():
-             for i in range(1, 101):
-                 start = interval * (i - 1)
-                 end = interval * i
-                 small_eval_set = raw_datasets["validation"].select(range(start, end))
-                 eval_set = small_eval_set.map(
-                     preprocess_validation_examples,
-                     batched=True,
-                     remove_columns=raw_datasets["validation"].column_names
-                 )
-                 eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
-                 eval_set_for_model.set_format("torch")
-                 batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
-                 outputs = model(**batch)
-                 start_logits = outputs.start_logits.cpu().numpy()
-                 end_logits = outputs.end_logits.cpu().numpy()
-                 res = compute_metrics(start_logits, end_logits, eval_set, small_eval_set)
-                 exact_match += res['exact_match']
-                 f1 += res["f1"]
-
-         print("F1 score: {}".format(f1 / 100))
-         print("Exact match: {}".format(exact_match / 100))
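# Note on the evaluation loop above: interval = len(validation) // 100 truncates,
# so the final len(validation) % 100 examples (70 of SQuAD's 10,570 validation
# examples) are never scored, and the printed numbers are the mean of 100
# equal-sized per-chunk metrics rather than one global SQuAD score. A
# hypothetical single call over everything that was scored would look like:
#
#     res = compute_metrics(all_start_logits, all_end_logits, full_eval_set, scored_examples)
#
# where all_start_logits / all_end_logits / full_eval_set / scored_examples are
# the concatenations of the per-chunk pieces (names are illustrative only).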