# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """

import csv
import glob
import json
import logging
import os
from typing import Dict, List, Optional

import tqdm

from transformers import PreTrainedTokenizer


logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for multiple choice."""

    def __init__(self, example_id, question, contexts, endings, label=None):
        """Constructs an InputExample.

        Args:
            example_id: Unique id for the example.
            question: string. The untokenized text of the second sequence (question).
            contexts: list of str. The untokenized text of the first sequence
                (context of corresponding question).
            endings: list of str. The multiple choice options. Its length must be
                equal to the length of ``contexts``.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.example_id = example_id
        self.question = question
        self.contexts = contexts
        self.endings = endings
        self.label = label


class InputFeatures(object):
    """Encoded features of one example: one feature dict per answer choice."""

    def __init__(self, example_id, choices_features, label):
        """Stores the per-choice features.

        Args:
            example_id: Id of the example the features were built from.
            choices_features: Iterable of ``(input_ids, input_mask, segment_ids)``
                triples, one per choice. Each triple is stored as a dict with the
                keys ``"input_ids"``, ``"input_mask"`` and ``"segment_ids"``.
            label: Label stored alongside the features.
        """
        self.example_id = example_id
        self.choices_features = [
            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label


class DataProcessor(object):
    """Base class for data converters for multiple choice data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()


class RaceProcessor(DataProcessor):
    """Processor for the RACE data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _load_split(self, data_dir, split):
        """Reads ``<data_dir>/<split>/high`` then ``<data_dir>/<split>/middle`` and builds the examples."""
        logger.info("LOOKING AT %s %s", data_dir, split)
        high = self._read_txt(os.path.join(data_dir, split, "high"))
        middle = self._read_txt(os.path.join(data_dir, split, "middle"))
        return self._create_examples(high + middle, split)

    def _read_txt(self, input_dir):
        """Loads every file in ``input_dir`` whose name ends in ``txt`` as JSON.

        Each parsed object gets its own file path stored under the ``"race_id"`` key.
        """
        lines = []
        # glob returns files in an OS-dependent order; sort so that the example
        # order is reproducible across machines and runs.
        files = sorted(glob.glob(os.path.join(input_dir, "*txt")))
        for file in tqdm.tqdm(files, desc="read files"):
            with open(file, "r", encoding="utf-8") as fin:
                data_raw = json.load(fin)
            data_raw["race_id"] = file
            lines.append(data_raw)
        return lines

    def _create_examples(self, lines, set_type):
        """Creates one `InputExample` per question of every article in ``lines``."""
        examples = []
        for data_raw in lines:
            race_id = "%s-%s" % (set_type, data_raw["race_id"])
            article = data_raw["article"]
            for i, answer in enumerate(data_raw["answers"]):
                # The answer is a letter; "A" maps to label "0", "B" to "1", and so on.
                truth = str(ord(answer) - ord("A"))
                question = data_raw["questions"][i]
                options = data_raw["options"][i]
                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
                        contexts=[article, article, article, article],  # this is not efficient but convenient
                        endings=[options[0], options[1], options[2], options[3]],
                        label=truth,
                    )
                )
        return examples


class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT %s train", data_dir)
        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT %s dev", data_dir)
        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")

    def get_test_examples(self, data_dir):
        """Always raises: `_create_examples` needs a label column, which the test file lacks.

        Raises:
            ValueError: unconditionally.
        """
        logger.info("LOOKING AT %s test", data_dir)
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
            "setting!"
        )

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
        """Reads ``input_file`` as CSV and returns all rows, header row included."""
        with open(input_file, "r", encoding="utf-8") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets.

        Args:
            lines: CSV rows; the first row holds the column names and is skipped.
            type: Name of the split. Only ``"train"`` triggers the header check.
                (The name shadows the builtin but is kept for caller compatibility.)

        Raises:
            ValueError: if ``type`` is ``"train"`` and the last header column is not ``"label"``.
        """
        header = lines[0] if lines else []
        if type == "train" and (not header or header[-1] != "label"):
            raise ValueError("For training, the input file must contain a label column.")

        examples = [
            InputExample(
                example_id=line[2],
                # in the swag dataset, the common beginning of each choice is stored in "sent2".
                question=line[5],
                contexts=[line[4], line[4], line[4], line[4]],
                endings=[line[7], line[8], line[9], line[10]],
                label=line[11],
            )
            for line in lines[1:]  # we skip the line with the column names
        ]
        return examples


class ArcProcessor(DataProcessor):
    """Processor for the ARC data set (request from allennlp)."""

    # There are two types of answer keys (letters and 1-based digits).
    # Both are normalized to the same zero-based label strings.
    _ANSWER_KEY_TO_LABEL = {
        "A": "0",
        "B": "1",
        "C": "2",
        "D": "3",
        "1": "0",
        "2": "1",
        "3": "2",
        "4": "3",
    }

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT %s train", data_dir)
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT %s dev", data_dir)
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT %s test", data_dir)
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
        """Returns the raw lines of ``input_file``; each line is parsed later as one JSON object."""
        with open(input_file, "r", encoding="utf-8") as fin:
            lines = fin.readlines()
        return lines

    def _create_examples(self, lines, type):
        """Creates examples from JSON lines, keeping only questions with exactly four choices.

        Args:
            lines: Iterable of JSON strings, one question per string.
            type: Name of the split. For ``"train"`` at least two examples must remain.
                (The name shadows the builtin but is kept for caller compatibility.)

        Raises:
            ValueError: if a kept question has an unknown ``answerKey``, or if the
                train split yields fewer than two examples.
        """
        examples = []
        three_choice = 0
        four_choice = 0
        five_choice = 0
        other_choices = 0
        # we deleted example which has more than or less than four choices
        for line in tqdm.tqdm(lines, desc="read arc data"):
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the file) carry no data.
                continue
            data_raw = json.loads(line.strip("\n"))
            question_choices = data_raw["question"]
            options = question_choices["choices"]
            num_choices = len(options)
            if num_choices == 3:
                three_choice += 1
                continue
            if num_choices == 5:
                five_choice += 1
                continue
            if num_choices != 4:
                other_choices += 1
                continue
            four_choice += 1

            # Exact lookup: a substring test would accept keys such as "" or "12".
            answer_key = data_raw["answerKey"]
            truth = self._ANSWER_KEY_TO_LABEL.get(str(answer_key))
            if truth is None:
                logger.info("truth ERROR! %s", str(answer_key))
                raise ValueError("Unknown answerKey {!r} for example {!r}".format(answer_key, data_raw.get("id")))

            examples.append(
                InputExample(
                    example_id=data_raw["id"],
                    question=question_choices["stem"],
                    contexts=[
                        options[0]["para"].replace("_", ""),
                        options[1]["para"].replace("_", ""),
                        options[2]["para"].replace("_", ""),
                        options[3]["para"].replace("_", ""),
                    ],
                    endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
                    label=truth,
                )
            )

        # Explicit raise instead of assert so the check survives `python -O`.
        if type == "train" and len(examples) <= 1:
            raise ValueError("The training set must contain more than one four-choice example.")

        logger.info("len examples: %s", str(len(examples)))
        logger.info("Three choices: %s", str(three_choice))
        logger.info("Five choices: %s", str(five_choice))
        logger.info("Other choices: %s", str(other_choices))
        logger.info("four choices: %s", str(four_choice))

        return examples


def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`

    Every (context, ending) pair of an example is encoded with
    ``tokenizer.encode_plus`` and padded to ``max_length``.

    Args:
        examples: Examples to encode.
        label_list: All labels; a label's position in this list is its integer id.
        max_length: Length every encoded sequence is padded to (and passed to the
            tokenizer as ``max_length``).
        tokenizer: Tokenizer providing ``encode_plus``.
        pad_token_segment_id: Value used to pad ``token_type_ids``.
        pad_on_left: Pad at the start of the sequence instead of the end.
        pad_token: Value used to pad ``input_ids``.
        mask_padding_with_zero: If True the attention mask is 1 for real tokens
            and 0 for padding; if False the two values are swapped.

    Returns:
        One `InputFeatures` per example.

    Raises:
        KeyError: if an example's label is not in ``label_list``.
    """
    label_map: Dict[Optional[str], int] = {label: i for i, label in enumerate(label_list)}
    real_mask, pad_mask = (1, 0) if mask_padding_with_zero else (0, 1)

    features = []
    # `enumerate` has no length, so pass `total` explicitly to get a real progress bar.
    progress = tqdm.tqdm(enumerate(examples), total=len(examples), desc="convert examples to features")
    for ex_index, example in progress:
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        choices_features = []
        for context, ending in zip(example.contexts, example.endings):
            text_a = context
            if "_" in example.question:
                # this is for cloze question
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are poping question + options, "
                    "you need to try to use a bigger max seq length!"
                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [real_mask] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            ids_padding = [pad_token] * padding_length
            mask_padding = [pad_mask] * padding_length
            type_padding = [pad_token_segment_id] * padding_length
            if pad_on_left:
                input_ids = ids_padding + input_ids
                attention_mask = mask_padding + attention_mask
                token_type_ids = type_padding + token_type_ids
            else:
                input_ids = input_ids + ids_padding
                attention_mask = attention_mask + mask_padding
                token_type_ids = token_type_ids + type_padding

            # Internal sanity checks: these fail if the encoded sequence is longer than max_length.
            assert len(input_ids) == max_length
            assert len(attention_mask) == max_length
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

        label = label_map[example.label]

        if ex_index < 2:
            # Log the first two examples in full for manual inspection.
            logger.info("*** Example ***")
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
                logger.info("label: {}".format(label))

        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))

    return features


# Task name -> processor class.
processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}

# Task name -> number of answer choices (a dict, so it can be indexed by task name).
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4}