File size: 3,254 Bytes
85e3d20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import pandas as pd
from argparse import ArgumentParser
from typing import List
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_data(dataset_dir: str, data_split: str, list_of_langs: List[str]) -> List[InputExample]:
    """Load one data split for several languages as sentence-pair examples.

    Expects files at ``<dataset_dir>/<lang>/<lang>_<data_split>.csv`` with
    columns ``sentence1``, ``sentence2`` and a numeric ``label``. Languages
    whose file is missing are skipped with a console message.

    Returns a flat list of InputExample(texts=[s1, s2], label=float).
    """
    examples: List[InputExample] = []
    for lang in list_of_langs:
        csv_path = os.path.join(dataset_dir, lang, f"{lang}_{data_split}.csv")
        if not os.path.exists(csv_path):
            print(f"{data_split} data for {lang} does not exist")
            continue

        frame = pd.read_csv(csv_path)
        labels = [float(value) for value in frame["label"].tolist()]
        pairs = zip(frame["sentence1"].tolist(), frame["sentence2"].tolist())
        for score, (first, second) in zip(labels, pairs):
            examples.append(InputExample(texts=[first, second], label=score))
    return examples


# --- Data loading and training configuration -------------------------------
# Root directory holding one sub-directory per language code (e.g. data/eng/).
dataset_dir= "data"
# Language codes to train and evaluate on; English only here.
list_of_langs=["eng"]
train_examples = load_data(dataset_dir, "train", list_of_langs)
test_examples = load_data(dataset_dir, "test", list_of_langs)

# Batched iterators over the InputExamples; only the training data is shuffled.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
# NOTE(review): test_dataloader is built but never used below — evaluation
# later works directly on test_examples; consider removing this line.
test_dataloader = DataLoader(test_examples, shuffle=False, batch_size=16)

# Prefer GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Multilingual LaBSE sentence encoder, fine-tuned so that the cosine
# similarity of a pair's embeddings regresses toward the gold label.
model = SentenceTransformer("sentence-transformers/LaBSE", device=device)
loss_function = losses.CosineSimilarityLoss(model=model)



# Fine-tune for 10 epochs with 100 linear warmup steps; the best model is
# written to output_path.
model.fit(
    train_objectives=[(train_dataloader, loss_function)],
    epochs=10,
    warmup_steps=100,
    output_path="semrel_baselines/models/finetuned_esp_labse",
)


def test_model(test_examples):
    """Score the (module-level) fine-tuned model on a list of InputExamples.

    Encodes both sentences of every pair, predicts each pair's similarity as
    the cosine of its two embeddings, and returns the Spearman rank
    correlation between gold labels and predictions.

    NOTE: relies on the module-level `model` (SentenceTransformer).
    """
    # Local import: scipy is only needed for evaluation.
    from scipy.stats import spearmanr

    sentence_1s = [ex.texts[0] for ex in test_examples]
    sentence_2s = [ex.texts[1] for ex in test_examples]
    gold_scores = [ex.label for ex in test_examples]

    # Calculate embeddings
    embeddings1 = model.encode(sentence_1s, convert_to_tensor=True)
    embeddings2 = model.encode(sentence_2s, convert_to_tensor=True)

    # Row-wise cosine similarity in O(n); the original built the full n x n
    # similarity matrix and read only its diagonal (O(n^2) time and memory).
    cos_sim_scores = torch.nn.functional.cosine_similarity(
        embeddings1, embeddings2, dim=1
    ).cpu().tolist()

    # BUG FIX: np.corrcoef computes the PEARSON correlation, but the variable
    # name and the final report claim Spearman — use the rank correlation.
    spearman_corr = spearmanr(gold_scores, cos_sim_scores).correlation
    return spearman_corr



# --- Final evaluation and submission file ----------------------------------

train_corr = test_model(train_examples)
test_corr = test_model(test_examples)
# BUG FIX: the correlations lie in [-1, 1]; scale by 100 before printing with
# a '%' suffix (the original printed e.g. '0.85%' for a 0.85 correlation).
print(f"Train Spearman correlation: {train_corr * 100:.2f}%, "
      f"Test Spearman correlation: {test_corr * 100:.2f}%")

# Save the predictions to submission.csv

sentence_1s = [ex.texts[0] for ex in test_examples]
sentence_2s = [ex.texts[1] for ex in test_examples]

embeddings1 = model.encode(sentence_1s, convert_to_tensor=True)
embeddings2 = model.encode(sentence_2s, convert_to_tensor=True)

# Row-wise cosine similarity in O(n); avoids the O(n^2) full similarity
# matrix whose diagonal was all the original used. (The unused copy of the
# gold labels has also been dropped.)
cos_sim_scores = torch.nn.functional.cosine_similarity(
    embeddings1, embeddings2, dim=1
).cpu().tolist()

# One predicted-similarity row per test pair; 'label' holds the prediction.
results_df = pd.DataFrame({
    "sentence1": sentence_1s,
    "sentence2": sentence_2s,
    "label": cos_sim_scores
})
result_path = "submission.csv"
results_df.to_csv(result_path, index=False)
print(f"Results saved to {result_path}")