import json
import random
from collections import defaultdict
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from .config_train import DEFAULT_TEXT_ANNOTATION_FILE, batch_size, tokenizer
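# The annotation file is assumed (based on how its fields are accessed below) to look
# roughly like the sketch here; the tag names are placeholders, not real dataset values:
#   {
#     "classes": ["TAG_A", "TAG_B", ...],
#     "annotations": [
#       ["sentence text", {"entities": [["entity text", "TAG_A"], ...]}],
#       ...
#     ]
#   }
# Each entity is stored as its surface text together with its tag.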
with open(DEFAULT_TEXT_ANNOTATION_FILE, 'r', encoding='utf-8') as file:
    data = json.load(file)
# Prepare sentences and labels
sentences = [item[0] for item in data["annotations"]]
"""
List[str]: A list of sentences extracted from the dataset.
Each sentence corresponds to an annotation in the dataset.
"""
labels = [item[1]['entities'] for item in data["annotations"]]
"""
List[List[Tuple[str, str]]]: A list of entity labels for each sentence.
Each label is a tuple containing the entity text and its corresponding tag.
"""
# Define tags
tags = data["classes"]
"""
List[str]: A list of all possible entity tags (classes) in the dataset.
These tags will be used to label the tokens in each sentence.
"""
# Count how often each tag occurs in the annotations
tag_occurrences = {tag: 0 for tag in tags}
for label in labels:
    for entity in label:
        tag_occurrences[entity[1]] += 1
# Sort tags by frequency (descending) and drop tags that never occur
sorted_tags = dict(sorted(tag_occurrences.items(), key=lambda item: item[1], reverse=True))
sorted_tags = {key: value for key, value in sorted_tags.items() if value != 0}
# Reserve the first slot for the padding tag
new_tag = {'<pad>': 0}
sorted_tags = {**new_tag, **sorted_tags}
# Convert tags to indices
tag2idx = {tag: idx for idx, tag in enumerate(sorted_tags.keys())}
"""
Dict[str, int]: A dictionary mapping each tag to a unique index.
This is used to convert tag labels into numerical format for model training.
"""
# Count the occurrences of each tag
tag_counts = defaultdict(int)
for item in data["annotations"]:
    for entity in item[1]["entities"]:
        tag_counts[entity[1]] += 1
# Create a list of annotations for each tag
tag_annotations = defaultdict(list)
for item in data["annotations"]:
    for entity in item[1]["entities"]:
        tag_annotations[entity[1]].append(item)
# Split the annotations for each tag into training and testing sets
train_annotations = []
test_annotations = []
for tag, annotations in tag_annotations.items():
    random.shuffle(annotations)
    split_point = len(annotations) // 2
    train_annotations.extend(annotations[:split_point])
    test_annotations.extend(annotations[split_point:])
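# Note: an annotation whose entities carry several different tags appears in more than one
# of the per-tag lists above, so the same sentence can end up in both the training and the
# testing split. Seeding random.shuffle (e.g. via random.seed) would also be needed to make
# this split reproducible across runs.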
# Prepare training and testing datasets
sentences_train = [item[0] for item in train_annotations]
sentences_test = [item[0] for item in test_annotations]
labels_train = [item[1]['entities'] for item in train_annotations]
labels_test = [item[1]['entities'] for item in test_annotations]
# Encode the sentences and labels into input IDs and tag IDs
input_ids_train = []
tag_ids_train = []
for index in range(len(sentences_train)):
    input_ids = tokenizer.encode(sentences_train[index])  # Tokenize the sentence
    input_ids_train.append(input_ids)
    tag_ids = [0 for _ in input_ids]  # Start every position at the '<pad>' / outside index
    for label in labels_train[index]:
        key_ids = tokenizer.encode(label[0])  # Encode the entity text on its own
        # Skip the special tokens at both ends of the entity encoding; each remaining
        # sub-token is assumed to appear verbatim in the sentence encoding, and its
        # first occurrence is labelled with the entity's tag index.
        for token_id in key_ids[1:len(key_ids) - 1]:
            position = input_ids.index(token_id)
            tag_ids[position] = tag2idx[label[1]]
    tag_ids_train.append(tag_ids)
input_ids_test = []
tag_ids_test = []
for index in range(len(sentences_test)):
    input_ids = tokenizer.encode(sentences_test[index])  # Tokenize the sentence
    input_ids_test.append(input_ids)
    tag_ids = [0 for _ in input_ids]  # Start every position at the '<pad>' / outside index
    for label in labels_test[index]:
        key_ids = tokenizer.encode(label[0])  # Encode the entity text on its own
        for token_id in key_ids[1:len(key_ids) - 1]:
            position = input_ids.index(token_id)
            tag_ids[position] = tag2idx[label[1]]
    tag_ids_test.append(tag_ids)
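# A hypothetical example of the alignment produced above (all token ids are made up):
#   sentence encodes to input_ids [101, 7592, 2088, 102]
#   entity ("world", "SOME_TAG") encodes to [101, 2088, 102]; the inner id 2088 is looked
#   up in input_ids and its position receives tag2idx["SOME_TAG"], so
#   tag_ids -> [0, 0, tag2idx["SOME_TAG"], 0]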
"""
padded torch.Tensor: A padded tensor of tag IDs for the training set.
All tag sequences are padded to the same length as their corresponding token sequences.
"""
# Pad the token-ID sequences; use the tokenizer's pad token ID so padding matches the
# attention-mask logic further below
padded_input_ids_train = pad_sequence([torch.tensor(ids) for ids in input_ids_train], batch_first=True, padding_value=tokenizer.pad_token_id)
input_ids_train = padded_input_ids_train.clone().detach()
padded_input_ids_test = pad_sequence([torch.tensor(ids) for ids in input_ids_test], batch_first=True, padding_value=tokenizer.pad_token_id)
input_ids_test = padded_input_ids_test.clone().detach()
# Pad the tag-ID sequences with 0, the index of the '<pad>' tag
padded_tag_ids_train = pad_sequence([torch.tensor(ids) for ids in tag_ids_train], batch_first=True, padding_value=0)
tag_ids_train = padded_tag_ids_train.clone().detach()
padded_tag_ids_test = pad_sequence([torch.tensor(ids) for ids in tag_ids_test], batch_first=True, padding_value=0)
tag_ids_test = padded_tag_ids_test.clone().detach()
# Find the maximum sequence length across all four padded tensors
max_length = max(padded_input_ids_train.size(1), padded_input_ids_test.size(1), padded_tag_ids_train.size(1), padded_tag_ids_test.size(1))
# Pad input_ids and tag_ids to have the same length
padded_input_ids_train = torch.nn.functional.pad(padded_input_ids_train, (0, max_length - padded_input_ids_train.size(1)), value=tokenizer.pad_token_id)
padded_input_ids_test = torch.nn.functional.pad(padded_input_ids_test, (0, max_length - padded_input_ids_test.size(1)), value=tokenizer.pad_token_id)
padded_tag_ids_train = torch.nn.functional.pad(padded_tag_ids_train, (0, max_length - padded_tag_ids_train.size(1)), value=0)
padded_tag_ids_test = torch.nn.functional.pad(padded_tag_ids_test, (0, max_length - padded_tag_ids_test.size(1)), value=0)
# Detached copies, now aligned to the common maximum length
input_ids_train = padded_input_ids_train.clone().detach()
tag_ids_train = padded_tag_ids_train.clone().detach()
input_ids_test = padded_input_ids_test.clone().detach()
tag_ids_test = padded_tag_ids_test.clone().detach()
"""
Mask torch.Tensor: A tensor containing the attention masks.
This tensor indicates which tokens are real and which are padding.
"""
attention_masks_train = torch.tensor([[1.0 if i != tokenizer.pad_token_id and i!= tokenizer.cls_token_id and i!= tokenizer.sep_token_id else 0 for i in ii] for ii in input_ids_train])
attention_masks_test = torch.tensor([[1.0 if i != tokenizer.pad_token_id and i!= tokenizer.cls_token_id and i!= tokenizer.sep_token_id else 0 for i in ii] for ii in input_ids_test])
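# Note: besides padding, these masks also zero out the [CLS] and [SEP] positions, so the
# special tokens are treated like padding wherever the mask is applied downstream; the
# usual Hugging Face convention masks only the padding tokens.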
# Final tensors for model training and validation
train_inputs = input_ids_train.clone().detach()
validation_inputs = input_ids_test.clone().detach()
train_labels = tag_ids_train.clone().detach()
validation_labels = tag_ids_test.clone().detach()
train_masks = attention_masks_train.clone().detach()
validation_masks = attention_masks_test.clone().detach()
# Create DataLoader for training data
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create DataLoader for validation data
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
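# Optional sanity check: a minimal sketch (not part of the original pipeline) that prints
# a few sizes and shapes when this module is run directly. It is skipped on import, so the
# DataLoaders above are unaffected.
if __name__ == "__main__":
    print(f"Tag vocabulary ({len(tag2idx)} tags): {tag2idx}")
    print(f"Train sentences: {len(sentences_train)}, test sentences: {len(sentences_test)}")
    sample_inputs, sample_masks, sample_labels = next(iter(train_dataloader))
    print(f"Batch shapes -> inputs: {tuple(sample_inputs.shape)}, "
          f"masks: {tuple(sample_masks.shape)}, labels: {tuple(sample_labels.shape)}")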