Spaces: Running on Zero

update app.py

app.py CHANGED
```diff
@@ -1,25 +1,23 @@
 import gradio as gr
 import spaces
 import torch
+import pandas as pd
+import matplotlib.pyplot as plt
 from datasets import load_dataset, concatenate_datasets
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
 from sentence_transformers.util import cos_sim
-import pandas as pd
 
 # Check for GPU support and configure appropriately
 device = "cuda" if torch.cuda.is_available() else "cpu"
 zero = torch.Tensor([0]).to(device)
 print(f"Device being used: {zero.device}")
 
 @spaces.GPU
 def evaluate_model(model_id):
-    # Load the model on the appropriate device
     model = SentenceTransformer(model_id, device=device)
-
-    # Define the evaluation parameters
     matryoshka_dimensions = [768, 512, 256, 128, 64]
 
     # Prepare datasets
     datasets_info = [
         {
@@ -28,38 +26,43 @@ def evaluate_model(model_id):
             "split": "train",
             "size": 7000,
             "columns": ("question", "context"),
-            "sample_size":
+            "sample_size": 500
         },
         {
-            "name": "MLQA Arabic",
+            "name": "MLQA Arabic (Long Context)",
             "dataset_id": "google/xtreme",
             "subset": "MLQA.ar.ar",
             "split": "validation",
             "size": 500,
             "columns": ("question", "context"),
-            "sample_size":
+            "sample_size": 500
         },
         {
-            "name": "
-            "dataset_id": "
+            "name": "ARCD (Short Context)",
+            "dataset_id": "hsseinmz/arcd",
             "split": "train",
             "size": None,
             "columns": ("question", "context"),
-            "sample_size":
+            "sample_size": 500,
+            "last_rows": True  # Flag to take the last 500 rows
         }
     ]
 
     evaluation_results = []
-
+    scores_by_dataset = {}
+
     for dataset_info in datasets_info:
        # Load the dataset with subset if available
        if "subset" in dataset_info:
            dataset = load_dataset(dataset_info["dataset_id"], dataset_info["subset"], split=dataset_info["split"])
        else:
            dataset = load_dataset(dataset_info["dataset_id"], split=dataset_info["split"])
-
-        #
-
+
+        # Take last 500 rows if specified
+        if dataset_info.get("last_rows"):
+            dataset = dataset.select(range(len(dataset) - dataset_info["sample_size"], len(dataset)))
+        else:
+            dataset = dataset.select(range(min(dataset_info["sample_size"], len(dataset))))
 
        # Rename columns
        dataset = dataset.rename_column(dataset_info["columns"][0], "anchor")
@@ -77,7 +80,6 @@ def evaluate_model(model_id):
        relevant_docs = {q_id: [q_id] for q_id in queries}
 
        matryoshka_evaluators = []
-        # Iterate over the different dimensions
        for dim in matryoshka_dimensions:
            ir_evaluator = InformationRetrievalEvaluator(
                queries=queries,
@@ -88,14 +90,11 @@ def evaluate_model(model_id):
                score_functions={"cosine": cos_sim},
            )
            matryoshka_evaluators.append(ir_evaluator)
 
-        # Create a sequential evaluator
        evaluator = SequentialEvaluator(matryoshka_evaluators)
-
-        # Evaluate the model
        results = evaluator(model)
 
-
+        scores = []
        for dim in matryoshka_dimensions:
            key = f"dim_{dim}_cosine_ndcg@10"
            score = results[key] if key in results else None
@@ -104,26 +103,52 @@ def evaluate_model(model_id):
                "Dimension": dim,
                "Score": score
            })
-
+            scores.append(score)
+
+        # Store scores by dataset for bar chart creation
+        scores_by_dataset[dataset_info["name"]] = scores
+
    # Convert results to DataFrame for display
    result_df = pd.DataFrame(evaluation_results)
-
+
+    # Generate bar charts for each dataset
+    charts = []
+    for dataset_name, scores in scores_by_dataset.items():
+        fig, ax = plt.subplots()
+        ax.bar([str(dim) for dim in matryoshka_dimensions], scores)
+        ax.set_title(f"{dataset_name} Evaluation Scores", fontsize=16, color='darkblue')
+        ax.set_xlabel("Embedding Dimension", fontsize=12)
+        ax.set_ylabel("NDCG@10 Score", fontsize=12)
+        plt.tight_layout()
+        charts.append(fig)
+
+    return result_df, charts
 
 # Define the Gradio interface
 def display_results(model_name):
-
-    result_df
-    return result_df
+    result_df, charts = evaluate_model(model_name)
+    return result_df, charts[0], charts[1], charts[2]
 
-# Create the Gradio interface
 demo = gr.Interface(
    fn=display_results,
-    inputs=gr.Textbox(label="Model ID"),
-    outputs=
-
-
+    inputs=gr.Textbox(label="Enter a Hugging Face Model ID", placeholder="e.g., sentence-transformers/all-MiniLM-L6-v2"),
+    outputs=[
+        gr.Dataframe(label="Evaluation Results"),
+        gr.Plot(label="Arabic Financial Dataset (Financial Evaluation)"),
+        gr.Plot(label="MLQA Arabic (Long Context Evaluation)"),
+        gr.Plot(label="ARCD (Short Context Evaluation)")
+    ],
+    title="Sentence Transformer Evaluation Dashboard",
+    description=(
+        "Evaluate your Sentence Transformer model across three datasets:\n"
+        "- **Arabic Financial Dataset** for financial data evaluation.\n"
+        "- **MLQA Arabic** for long context question-answer evaluation.\n"
+        "- **ARCD** for short context question-answer evaluation.\n"
+        "Results are displayed in a table and visualized with bar charts for each dataset."
+    ),
+    theme="default",
+    live=False
 )
 
-# Launch the Gradio app
 if __name__ == "__main__":
    demo.launch()
```
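The collapsed region between the second and third hunks (old lines 66-76) builds the query and corpus mappings that `InformationRetrievalEvaluator` consumes. For orientation, here is a minimal sketch of what that section typically looks like in the standard Matryoshka evaluation recipe. The `positive` rename, the integer `id` column, and the `name=f"dim_{dim}"` argument are assumptions inferred from the visible fragments (`rename_column(..., "anchor")`, `relevant_docs = {q_id: [q_id] ...}`, and the `dim_{dim}_cosine_ndcg@10` result keys); this is not necessarily the commit's exact code:

```python
# Hypothetical reconstruction of the collapsed section, for orientation only.
dataset = dataset.rename_column(dataset_info["columns"][1], "positive")  # assumed counterpart to "anchor"
dataset = dataset.add_column("id", list(range(len(dataset))))            # assumed integer IDs

corpus = dict(zip(dataset["id"], dataset["positive"]))   # doc_id -> context
queries = dict(zip(dataset["id"], dataset["anchor"]))    # query_id -> question
relevant_docs = {q_id: [q_id] for q_id in queries}       # a question's own context is its gold document

for dim in matryoshka_dimensions:
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",                    # yields result keys like "dim_768_cosine_ndcg@10"
        truncate_dim=dim,                     # score embeddings cut to the first `dim` components
        score_functions={"cosine": cos_sim},
    )
    matryoshka_evaluators.append(ir_evaluator)
```

Here `truncate_dim` (available in sentence-transformers 2.7+) makes each evaluator score embeddings truncated to the first `dim` components, which is what turns a single model run into the per-dimension Matryoshka comparison plotted above.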
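Since `@spaces.GPU` is a no-op outside a ZeroGPU Space, the evaluation can also be smoke-tested locally without launching the Gradio app. A hypothetical example (the model ID is only an illustration):

```python
# Hypothetical local smoke test; assumes the dependencies above are installed
# (pip install spaces gradio sentence-transformers datasets matplotlib pandas).
df, charts = evaluate_model("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
print(df)                        # one row per (dataset, dimension) pair with its NDCG@10 score
charts[0].savefig("scores.png")  # Matplotlib figures can be saved instead of rendered by Gradio
```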