| import config | |
| import torch | |
class BERTDataset:
    """Map-style dataset pairing raw review texts with float targets for BERT.

    Each item is tokenized with the project tokenizer (``config.TOKENIZER``),
    truncated/padded to ``config.MAX_LEN``, and returned as a dict of
    ``torch.long`` tensors plus a ``torch.float`` target — the shape expected
    by a standard DataLoader + BERT fine-tuning loop.
    """

    def __init__(self, review, target):
        # review: indexable sequence of raw text strings
        # target: parallel indexable sequence of labels (one per review)
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        # Dataset size = number of reviews.
        return len(self.review)

    def __getitem__(self, item):
        # Normalize: force to str and collapse all whitespace runs to single spaces.
        review = str(self.review[item])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,  # no second sentence (single-sequence task)
            add_special_tokens=True,
            max_length=self.max_len,
            # BUG FIX: without truncation, reviews longer than max_len tokens
            # make len(ids) > max_len, padding_length below goes negative,
            # and the per-item tensors end up with inconsistent lengths
            # (crashes the default DataLoader collate).
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        # Right-pad all three sequences with zeros up to max_len so every
        # item in a batch has identical length.
        padding_length = self.max_len - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float)
        }