import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from tqdm import tqdm

# Hyperparameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LEN = 100
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

# Build the dataframes
train_df_DB = pd.read_csv('./data/train.csv')
train_df_DB['label'] = train_df_DB.iloc[:, 1:].values.tolist()

test_df_DB = pd.read_csv('./data/test.csv')
test_df_DB = test_df_DB[['text', 'preprocess_sentence', 'label']]
test_df_DB['label'] = test_df_DB.iloc[:, 2:].values.tolist()

# Dataset class
class BinaryLabel(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}

training_set = BinaryLabel(train_df_DB, tokenizer, MAX_LEN)
testing_set = BinaryLabel(test_df_DB, tokenizer, MAX_LEN)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Create model: DistilBERT encoder + linear head on the [CLS] token
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # DistilBERT does not use token_type_ids; they are accepted here but not passed on
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]           # [CLS] token representation
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)      # raw logits, paired with BCEWithLogitsLoss
        return output

model_DB = DistilBERTClass()
model_DB.to(device)

# Validation function
def validation(testing_loader):
    model_DB.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model_DB(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

# Train function
def train(epoch):
    model_DB.train()
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model_DB(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _ % 50 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
        if loss.item() < 0.07:
            print(f'Breaking the loop as loss is below 0.07: {loss.item()}')
            break
        loss.backward()
        optimizer.step()

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model_DB.parameters(), lr=LEARNING_RATE)

loaded_model_path = './model_DB_1.pt'
model_DB.load_state_dict(torch.load(loaded_model_path, map_location=torch.device('cpu')))
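
# --- Usage sketch (assumption, not part of the original script) ---
# A minimal example of how the pieces above could be wired together:
# run train() for EPOCHS, score the test set with validation(), and save
# the weights to the same path the script reloads from. The 0.5 decision
# threshold and the exact-match accuracy below are illustrative choices.
#
# for epoch in range(EPOCHS):
#     train(epoch)
#
# outputs, targets = validation(testing_loader)
# preds = (np.array(outputs) >= 0.5).astype(float)   # sigmoid probabilities -> hard labels
# targs = np.array(targets)
# print(f"Exact-match accuracy: {(preds == targs).all(axis=1).mean():.4f}")
#
# torch.save(model_DB.state_dict(), './model_DB_1.pt')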