Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files- example/ex2_final.py +140 -0
- example/ex2_init.py +61 -0
example/ex2_final.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.optim as optim
|
| 7 |
+
from torch.utils.data import DataLoader, Dataset
|
| 8 |
+
from transformers import BertTokenizer, BertModel
|
| 9 |
+
|
| 10 |
+
# Define constants
# The six analytic scoring dimensions predicted for each essay; the model
# emits one regression output per dimension, in this order.
DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
|
| 12 |
+
|
| 13 |
+
class EssayDataset(Dataset):
    """Torch dataset pairing essay texts with their per-dimension score targets.

    Each item is a dict holding the raw text, flattened BERT-style token ids
    and attention mask (padded/truncated to ``max_len``), and a float tensor
    of regression targets.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text = self.texts[idx]
        # Tokenize a single essay; padding='max_length' + truncation=True
        # guarantees a fixed-length encoding of exactly self.max_len tokens.
        enc = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        # encode_plus returns (1, max_len) tensors; flatten to (max_len,).
        return {
            'text': sample_text,
            'input_ids': enc['input_ids'].flatten(),
            'attention_mask': enc['attention_mask'].flatten(),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float),
        }
|
| 44 |
+
|
| 45 |
+
class EssayScoreRegressor(nn.Module):
    """BERT encoder with dropout and a linear head for multi-output regression.

    Produces ``n_outputs`` scores per input sequence (one per essay-scoring
    dimension), read off BERT's pooled [CLS] representation.
    """

    def __init__(self, n_outputs):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # 'pooler_output' is the tanh-projected [CLS] embedding.
        dropped = self.drop(encoded['pooler_output'])
        return self.out(dropped)
|
| 59 |
+
|
| 60 |
+
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimization pass over *data_loader* and return the mean batch loss.

    The scheduler is stepped once per batch. ``n_examples`` is accepted for
    interface compatibility but is not used by the loss computation.
    """
    model = model.train()
    batch_losses = []

    for batch in data_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        target = batch['targets'].to(device)

        predictions = model(input_ids=ids, attention_mask=mask)
        batch_loss = loss_fn(predictions, target)
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(batch_losses)
|
| 80 |
+
|
| 81 |
+
def _eval_epoch(model, data_loader, loss_fn, device):
    """Return the mean loss over *data_loader* with gradients disabled."""
    model = model.eval()
    losses = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            losses.append(loss_fn(outputs, targets).item())
    return np.mean(losses)

def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
    """Fine-tune *model* on the essay data, reporting train and validation loss.

    Args:
        train_data / val_data: DataFrames with a 'full_text' column and one
            column per entry in DIMENSIONS.
        tokenizer: tokenizer passed through to EssayDataset.
        model, optimizer, scheduler, device: training state; scheduler is
            stepped per batch inside train_epoch.
        epochs, batch_size, max_len: loop and encoding hyperparameters.

    Fix: the validation DataLoader was previously constructed but never
    consumed, so no validation ever ran; each epoch now also evaluates and
    prints the validation loss.
    """
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    train_data_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    val_data_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    loss_fn = nn.MSELoss().to(device)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)

        train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(train_dataset)
        )

        print(f'Train loss {train_loss}')

        # Previously dead code path: actually evaluate on the held-out split.
        val_loss = _eval_epoch(model, val_data_loader, loss_fn, device)
        print(f'Val loss {val_loss}')
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
    # Training entry point: expects train.csv with a 'full_text' column plus
    # one column per DIMENSIONS entry.
    df = pd.read_csv('train.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    # Approximate total optimizer steps: batches per epoch (batch size 16)
    # times 5 epochs.
    total_steps = len(df) // 16 * 5
    # NOTE(review): StepLR with step_size=total_steps, stepped once per batch
    # in train_epoch, only decays the LR after the final step — i.e. the LR is
    # effectively constant for the whole run. Confirm this is intended; a
    # warmup/linear-decay schedule is the usual choice for BERT fine-tuning.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)

    # 80/20 train/validation split, seeded for reproducibility.
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)

    train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs=5, batch_size=16, max_len=160)
|
example/ex2_init.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
|
| 3 |
+
import numpy as np
|
| 4 |
+
import random
|
| 5 |
+
import torch
|
| 6 |
+
from sklearn.model_selection import train_test_split
|
| 7 |
+
|
| 8 |
+
# The six analytic scoring dimensions predicted for each essay, in the column
# order used throughout this script.
DIMENSIONS = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
SEED = 42

# Seed every RNG in use (stdlib, torch, numpy) so runs are reproducible.
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
|
| 14 |
+
|
| 15 |
+
def compute_metrics_for_regression(y_test, y_test_pred):
    """Compute per-dimension RMSE between targets and predictions.

    Args:
        y_test: sequence of target rows, one value per DIMENSIONS entry.
        y_test_pred: sequence of prediction rows, same shape as y_test.

    Returns:
        dict mapping 'rmse_<dimension>' to that dimension's RMSE as a float.
    """
    metrics = {}
    # enumerate() gives the column index directly instead of the original
    # repeated DIMENSIONS.index(task) lookup per element (accidental O(n^2)).
    for idx, task in enumerate(DIMENSIONS):
        targets_task = np.asarray([t[idx] for t in y_test], dtype=float)
        pred_task = np.asarray([p[idx] for p in y_test_pred], dtype=float)

        # RMSE computed directly with numpy: the previous
        # mean_squared_error(..., squared=False) form was deprecated and then
        # removed in scikit-learn >= 1.6.
        rmse = float(np.sqrt(np.mean((targets_task - pred_task) ** 2)))

        metrics[f"rmse_{task}"] = rmse

    return metrics
|
| 26 |
+
|
| 27 |
+
def train_model(X_train, y_train, X_valid, y_valid):
    """Train a score-regression model on the given split.

    Placeholder: no training is implemented yet, so this always returns None.
    """
    # TODO: replace with a real training routine.
    return None
|
| 30 |
+
|
| 31 |
+
def predict(model, X):
    """Return placeholder predictions for *X*.

    Produces a uniform-random array of shape (len(X), len(DIMENSIONS));
    *model* is ignored until a real model exists.
    """
    n_samples = len(X)
    return np.random.rand(n_samples, len(DIMENSIONS))
|
| 34 |
+
|
| 35 |
+
if __name__ == '__main__':

    # Load training data; the CSV columns are renamed on read and indexed by
    # essay id.
    ellipse_df = pd.read_csv('train.csv',
                             header=0, names=['text_id', 'full_text', 'Cohesion', 'Syntax',
                                              'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'],
                             index_col='text_id')
    # Drop rows with any missing scores.
    ellipse_df = ellipse_df.dropna(axis=0)

    data_df = ellipse_df
    X = list(data_df.full_text.to_numpy())
    # Fix: build the target matrix in one pass. The original comprehension
    # re-evaluated data_df.drop(...) for every row (accidental O(n^2)).
    y = data_df.drop(['full_text'], axis=1).to_numpy()

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=SEED)

    model = train_model(X_train, y_train, X_valid, y_valid)

    # Evaluate on the held-out split and report per-dimension RMSE plus MCRMSE.
    y_valid_pred = predict(model, X_valid)
    metrics = compute_metrics_for_regression(y_valid, y_valid_pred)
    print(metrics)
    print("final MCRMSE on validation set: ", np.mean(list(metrics.values())))

    # Score the test set and write the submission file.
    submission_df = pd.read_csv('test.csv', header=0, names=['text_id', 'full_text'], index_col='text_id')
    X_submission = list(submission_df.full_text.to_numpy())
    y_submission = predict(model, X_submission)
    # Bug fix: keep the original text_id index. Previously the predictions
    # DataFrame was built with a fresh RangeIndex (0..n-1), so submission.csv
    # lost the actual essay ids.
    submission_out = pd.DataFrame(y_submission, columns=DIMENSIONS, index=submission_df.index)
    submission_out.index = submission_out.index.rename('text_id')
    submission_out.to_csv('submission.csv')
|