import base64
import json
import os
import re  # used in render_fixed_columns to pin the first cell of every table row

import fasttext
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    balanced_accuracy_score,
    matthews_corrcoef,
)

# Constants
MODEL_REPO = "atlasia/Sfaya-Moroccan-Darija-vs-All"
BIN_FILENAME = "model_multi_v3_2fpr.bin"
BINARY_LEADERBOARD_FILE = "darija_leaderboard_binary.json"
MULTILINGUAL_LEADERBOARD_FILE = "darija_leaderboard_multilingual.json"
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"

target_label = "Morocco"
is_binary = False

# Load test dataset
test_dataset = load_dataset(DATA_PATH, split='test')

# Supported dialects
all_target_languages = list(test_dataset.unique("dialect"))
supported_dialects = all_target_languages + ['All']
languages_to_display_one_vs_all = all_target_languages  # everything except All

metrics = [
    'f1_score',
    'precision',
    'recall',
    'specificity',
    'false_positive_rate',
    'false_negative_rate',
    'negative_predictive_value',
    'n_test_samples',
]

default_metrics = [
    'f1_score',
    'precision',
    'recall',
    'false_positive_rate',
    'false_negative_rate'
]

# Default languages to display in the one-vs-all leaderboard
default_languages = [
    'Morocco',
    'MSA',
    'Egypt',
    'Algeria',
    'Tunisia',
    'Levantine',
]

language_mapping_dict = {
    'ace_Arab': 'Acehnese',
    'acm_Arab': 'Mesopotamia',    # Gilit Mesopotamian
    'aeb_Arab': 'Tunisia',
    'ajp_Arab': 'Levantine',      # South Levantine
    'apc_Arab': 'Levantine',
    'arb_Arab': 'MSA',
    'arq_Arab': 'Algeria',
    'ars_Arab': 'Saudi',          # Najdi is primarily Saudi Arabian
    'ary_Arab': 'Morocco',
    'arz_Arab': 'Egypt',
    'ayp_Arab': 'Mesopotamia',    # North Mesopotamian
    'azb_Arab': 'Azerbaijan',     # South Azerbaijani pertains to this region
    'bcc_Arab': 'Balochistan',    # Southern Balochi is from Balochistan
    'bjn_Arab': 'Indonesia',      # Banjar is spoken in Indonesia
    'brh_Arab': 'Pakistan',       # Brahui is spoken in Pakistan
    'ckb_Arab': 'Kurdistan',      # Central Kurdish is mainly in Iraq
    'fuv_Arab': 'Nigeria',        # Hausa States Fulfulde
    'glk_Arab': 'Iran',           # Gilaki is spoken in Iran
    'hac_Arab': 'Iran',           # Gurani is also primarily spoken in Iran
    'kas_Arab': 'Kashmir',
    'knc_Arab': 'Nigeria',        # Central Kanuri is in Nigeria
    'lki_Arab': 'Iran',           # Laki is from Iran
    'lrc_Arab': 'Iran',           # Northern Luri is from Iran
    'min_Arab': 'Indonesia',      # Minangkabau is spoken in Indonesia
    'mzn_Arab': 'Iran',           # Mazanderani is spoken in Iran
    'ota_Arab': 'Turkey',         # Ottoman Turkish
    'pbt_Arab': 'Afghanistan',    # Southern Pashto
    'pnb_Arab': 'Pakistan',       # Western Panjabi
    'sdh_Arab': 'Iraq',           # Southern Kurdish
    'shu_Arab': 'Chad',           # Chadian Arabic
    'skr_Arab': 'Pakistan',       # Saraiki
    'snd_Arab': 'Pakistan',       # Sindhi
    'sus_Arab': 'Guinea',         # Susu
    'tuk_Arab': 'Turkmenistan',   # Turkmen
    'uig_Arab': 'Uighur (China)', # Uighur
    'urd_Arab': 'Pakistan',       # Urdu
    'uzs_Arab': 'Uzbekistan',     # Southern Uzbek
    'zsm_Arab': 'Malaysia'        # Standard Malay
}

def predict_label(text, model, language_mapping_dict, use_mapping=False):
    """Predict the dialect label for a single text using a fastText model."""
    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')
    if text == '':
        return 'Other'
    try:
        # Get the top prediction
        prediction = model.predict(text, 1)
        # Extract the label and remove the __label__ prefix
        label = prediction[0][0].replace('__label__', '')
        # Extract the confidence score (currently unused)
        confidence = prediction[1][0]
        # Map the raw label to a language name using language_mapping_dict
        if use_mapping:
            label = language_mapping_dict.get(label, 'Other')
        return label
    except Exception as e:
        print(f"Error processing text: {text}")
        print(f"Exception: {e}")
        # Return a plain string so the failure case has the same type as the success case
        return 'Other'

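# Minimal usage sketch (assumes a locally available copy of the fastText binary;
# the sample text and scores are purely illustrative):
#   model = fasttext.load_model(BIN_FILENAME)
#   model.predict("some darija text", 1)
#   # -> (('__label__ary_Arab',), array([0.98]))
#   predict_label("some darija text", model, language_mapping_dict, use_mapping=True)
#   # -> 'Morocco'
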
def compute_classification_metrics(test_dataset):
    """
    Compute comprehensive classification metrics for each class.

    Args:
        test_dataset: Dataset (convertible to a DataFrame) containing 'dialect' as true labels
            and 'preds' as predicted labels.

    Returns:
        tuple: (pd.DataFrame with detailed per-class metrics, dict of summary metrics).
    """
    # Transform the dataset into a DataFrame
    data = pd.DataFrame(test_dataset)

    # Extract true labels and predictions
    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])

    # Handle all unique labels
    labels = sorted(list(set(true_labels + predicted_labels)))
    label_to_index = {label: index for index, label in enumerate(labels)}

    # Convert labels to indices
    true_indices = [label_to_index[label] for label in true_labels]
    pred_indices = [label_to_index[label] for label in predicted_labels]

    # Compute basic metrics
    f1_scores = f1_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

    # Compute confusion matrix
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Calculate per-class counts
    FP = conf_mat.sum(axis=0) - np.diag(conf_mat)  # False Positives
    FN = conf_mat.sum(axis=1) - np.diag(conf_mat)  # False Negatives
    TP = np.diag(conf_mat)                         # True Positives
    TN = conf_mat.sum() - (FP + FN + TP)           # True Negatives

    # Calculate sample counts per class
    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Calculate additional metrics
    with np.errstate(divide='ignore', invalid='ignore'):
        fp_rate = FP / (FP + TN)      # False Positive Rate
        fn_rate = FN / (FN + TP)      # False Negative Rate
        specificity = TN / (TN + FP)  # True Negative Rate
        npv = TN / (TN + FN)          # Negative Predictive Value

    # Replace NaN/inf with 0
    rates = [fp_rate, fn_rate, specificity, npv]
    rates = [np.nan_to_num(m, nan=0.0, posinf=0.0, neginf=0.0) for m in rates]
    fp_rate, fn_rate, specificity, npv = rates

    # Calculate overall metrics
    balanced_acc = balanced_accuracy_score(true_indices, pred_indices)
    mcc = matthews_corrcoef(true_indices, pred_indices)

    # Compile results into a DataFrame
    result_df = pd.DataFrame({
        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,
        'false_positive_rate': fp_rate,
        'false_negative_rate': fn_rate,
        'true_positives': TP,
        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
        'negative_predictive_value': npv
    })

    # Sort by number of samples (descending)
    result_df = result_df.sort_values('samples', ascending=False)

    # Calculate summary metrics
    summary_metrics = {
        'macro_f1': f1_score(true_indices, pred_indices, average='macro'),
        'weighted_f1': f1_score(true_indices, pred_indices, average='weighted'),
        'micro_f1': f1_score(true_indices, pred_indices, average='micro'),
        'balanced_accuracy': balanced_acc,
        'matthews_correlation': mcc
    }

    # Round all numeric columns to 4 decimal places
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

    print(f'result_df: {result_df}')

    return result_df, summary_metrics

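# Worked example of the per-class counts above (hypothetical 2-class confusion matrix,
# rows = true labels, columns = predictions; values chosen only for illustration):
#   conf_mat = [[40,  5],    # class 0: 40 correct, 5 predicted as class 1
#               [10, 45]]    # class 1: 10 predicted as class 0, 45 correct
#   TP = diag                 = [40, 45]
#   FP = col_sum - diag       = [10,  5]
#   FN = row_sum - diag       = [ 5, 10]
#   TN = total - TP - FP - FN = [45, 40]
#   fp_rate = FP / (FP + TN)  = [10/55, 5/45] ≈ [0.1818, 0.1111]
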
def make_binary(dialect, target):
    if dialect != target:
        return 'Other'
    return target

def run_eval_one_vs_all(model, data_test, TARGET_LANG='Morocco', language_mapping_dict=None, use_mapping=False):
    """Compute, for every dialect, the rate at which its samples are predicted as TARGET_LANG."""
    # Predict labels using the model
    print(f"[INFO] Running predictions...")
    data_test['preds'] = data_test['text'].apply(
        lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping)
    )

    # Map the true labels to a binary target-vs-Other view
    # (kept for reference; the rate computation below works on data_test directly)
    df_test_preds = data_test.copy()
    df_test_preds.loc[df_test_preds['dialect'] != TARGET_LANG, 'dialect'] = 'Other'

    # Compute, per true dialect, the share of samples predicted as each label
    dialect_counts = data_test.groupby('dialect')['dialect'].count().reset_index(name='size')
    result_df = pd.merge(dialect_counts, data_test, on='dialect')
    result_df = result_df.groupby(['dialect', 'size', 'preds'])['preds'].count() / result_df.groupby(['dialect', 'size'])['preds'].count()
    result_df.sort_index(ascending=False, level='size', inplace=True)

    # Keep only the share of samples predicted as TARGET_LANG
    # (the false positive rate for every dialect other than the target itself)
    out = result_df.copy()
    out.name = 'false_positive_rate'
    out = out.reset_index()
    out = out[out['preds'] == TARGET_LANG].drop(columns=['preds', 'size'])

    print(f'out for TARGET_LANG={TARGET_LANG} \n: {out}')
    return out

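# The returned DataFrame has two columns, for example (hypothetical values):
#       dialect  false_positive_rate
#   0   Morocco               0.9876   # share of true Morocco samples predicted 'Morocco' (this is recall, not an FPR)
#   1   Algeria               0.0213
#   2       MSA               0.0045
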
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    # Resolve the leaderboard file relative to this script (consistent with load_leaderboard_one_vs_all)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    BINARY_LEADERBOARD_FILE = os.path.join(current_dir, BINARY_LEADERBOARD_FILE)
    try:
        with open(BINARY_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    # Process the results for each dialect/country
    for _, row in result_df.iterrows():
        dialect = row['dialect']
        # Skip the 'Other' class, it is considered the null space
        if dialect == 'Other':
            continue

        # Find the existing target_lang entry or create a new one
        target_entry = next((item for item in data if target_lang in item), None)
        if target_entry is None:
            target_entry = {target_lang: {}}
            data.append(target_entry)

        # Get the country-specific data for this target language
        country_data = target_entry[target_lang]

        # Initialize the dialect/country entry if it doesn't exist
        if dialect not in country_data:
            country_data[dialect] = {}

        # Update the model metrics under the model name for the given dialect
        country_data[dialect][model_name] = float(row['false_positive_rate'])

        # # Add the number of test samples, if not already present
        # if "n_test_samples" not in country_data[dialect]:
        #     country_data[dialect]["n_test_samples"] = int(row['size'])

    # Save updated leaderboard data
    with open(BINARY_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)

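# Resulting JSON layout of the binary leaderboard (one entry per target language;
# the model name and numbers below are illustrative, not real results):
# [
#     {
#         "Morocco": {
#             "Algeria": {"some-org/some-model": 0.0213},
#             "MSA": {"some-org/some-model": 0.0045}
#         }
#     }
# ]
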
def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    # Keep the repo id for naming before overwriting model_path with the local cached file path
    model_repo_id = model_path

    # Download the model and get its local path
    model_path = hf_hub_download(repo_id=model_repo_id, filename=model_path_bin, cache_dir=None)

    # Load the trained model
    print(f"[INFO] Loading model from Path: {model_path}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path)

    # Load the evaluation dataset
    print(f"[INFO] Loading evaluation dataset from Path: {DATA_PATH}...")
    eval_dataset = load_dataset(DATA_PATH, split='test')

    # Transform it into a pandas DataFrame
    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)

    # Run the multilingual evaluation
    result_df, _ = run_eval(model, df_eval, language_mapping_dict, use_mapping=use_mapping)

    # Set the model name from the repo id and the binary filename
    model_name = model_repo_id + '/' + model_path_bin

    # Update the multilingual leaderboard
    update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE)

    # Run the one-vs-all evaluation for every target language and update the binary leaderboard
    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(model, df_eval, TARGET_LANG=target_lang, language_mapping_dict=language_mapping_dict, use_mapping=use_mapping)
        update_darija_one_vs_all_leaderboard(result_df_one_vs_all, model_name, target_lang, BINARY_LEADERBOARD_FILE)

    # Load the updated leaderboard tables
    df_multilingual = load_leaderboard_multilingual()
    df_one_vs_all = load_leaderboard_one_vs_all()

    status_message = "**Evaluation completed! 🤗**"

    return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message

def run_eval(model, df_eval, language_mapping_dict=None, use_mapping=False):
    """Run the multilingual evaluation on a dataset and compute per-class metrics.

    Args:
        model: The fastText model to evaluate.
        df_eval (pd.DataFrame): Evaluation data with 'text' and 'dialect' columns.
        language_mapping_dict (dict): Optional mapping from raw model labels to language names.
        use_mapping (bool): If True, map raw labels through language_mapping_dict.

    Returns:
        tuple: (pd.DataFrame of per-class evaluation metrics, pd.DataFrame with the predictions).
    """
    # Predict labels using the model
    print(f"[INFO] Running predictions...")
    df_eval['preds'] = df_eval['text'].apply(
        lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping)
    )

    # Drop the columns that are no longer needed, i.e. 'text', 'metadata' and 'dataset_source'
    df_eval = df_eval.drop(columns=['text', 'metadata', 'dataset_source'])

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval)

    # update_darija_multilingual_leaderboard(result_df, model_path, MULTILINGUAL_LEADERBOARD_FILE)
    return result_df, df_eval

def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    try:
        if file is None:
            return "Please upload a file."

        # Clean the model name so it is safe to use in file paths
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
        print(f"[INFO] uploaded_model_name: {uploaded_model_name}")

        # Create the directory for saving submissions
        path_saving = os.path.join(base_path_save, uploaded_model_name)
        os.makedirs(path_saving, exist_ok=True)

        # Define the full path to save the file
        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as a DataFrame
        print(f"[INFO] Loading results...")
        df_eval = pd.read_csv(file.name)

        # Save the DataFrame
        print(f"[INFO] Saving the file locally in: {saved_file_path}")
        df_eval.to_csv(saved_file_path, index=False)

    except Exception as e:
        return f"Error processing file: {str(e)}"

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval)

    # Update the multilingual leaderboard
    update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTILINGUAL_LEADERBOARD_FILE)

    # TODO: implement one_vs_all differently for CSV-only submissions. They would need to submit two files,
    # one for the multilingual evaluation and one for the one-vs-all evaluation.
    # result_df_one_vs_all = run_eval_one_vs_all(...)
    # update_darija_one_vs_all_leaderboard(...)

    # Refresh the leaderboard table
    df = load_leaderboard_multilingual()

    return create_leaderboard_display_multilingual(df, default_language, default_metrics)

def update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    # Load leaderboard data
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)
    try:
        with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    # Process the results for each dialect/country
    for _, row in result_df.iterrows():
        country = row['country']
        # Skip the 'Other' class, it is considered the null space
        if country == 'Other':
            continue

        # Create the metrics dictionary
        metrics = {
            'f1_score': float(row['f1_score']),
            'precision': float(row['precision']),
            'recall': float(row['recall']),
            'specificity': float(row['specificity']),
            'false_positive_rate': float(row['false_positive_rate']),
            'false_negative_rate': float(row['false_negative_rate']),
            'negative_predictive_value': float(row['negative_predictive_value']),
            'n_test_samples': int(row['samples'])
        }

        # Find the existing country entry or create a new one
        country_entry = next((item for item in data if country in item), None)
        if country_entry is None:
            country_entry = {country: {}}
            data.append(country_entry)

        # Update the model metrics directly under the model name
        if country not in country_entry:
            country_entry[country] = {}
        country_entry[country][model_name] = metrics

    # Save updated leaderboard data
    with open(MULTILINGUAL_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)

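# Resulting JSON layout of the multilingual leaderboard (one entry per country;
# the model name and numbers below are illustrative, not real results):
# [
#     {
#         "Morocco": {
#             "some-org/some-model": {
#                 "f1_score": 0.95, "precision": 0.94, "recall": 0.96,
#                 "specificity": 0.99, "false_positive_rate": 0.01,
#                 "false_negative_rate": 0.04, "negative_predictive_value": 0.99,
#                 "n_test_samples": 1000
#             }
#         }
#     }
# ]
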
def load_leaderboard_one_vs_all(BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    BINARY_LEADERBOARD_FILE = os.path.join(current_dir, BINARY_LEADERBOARD_FILE)
    with open(BINARY_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Flatten the nested JSON into rows
    rows = []

    # Process each target language's data
    for leaderboard_data in data:
        for target_language, results in leaderboard_data.items():
            for language, models in results.items():
                for model_name, false_positive_rate in models.items():
                    rows.append({
                        'target_language': target_language,
                        'language': language,
                        'model': model_name,
                        'false_positive_rate': false_positive_rate,
                    })

    # Convert to DataFrame
    df = pd.DataFrame(rows)

    # Pivot so that models are rows, languages are columns, and each cell holds the
    # (model, target_language, language) false positive rate
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()

    # print(f'df_pivot \n: {df_pivot}')
    return df_pivot

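# Shape of the pivoted table (hypothetical model name and values, for illustration only):
#   model                 target_language  Algeria  Egypt    MSA    ...
#   some-org/some-model   Morocco           0.0213  0.0102  0.0045  ...
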
def load_leaderboard_multilingual(MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)
    with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Flatten the nested JSON into rows
    rows = []

    # Process each country's data
    for country_data in data:
        for country, models in country_data.items():
            for model_name, metrics in models.items():
                row = {
                    'country': country,
                    'model': model_name,
                }
                # Add all metrics to the row
                row.update(metrics)
                rows.append(row)

    # Convert to DataFrame
    df = pd.DataFrame(rows)
    return df

def create_leaderboard_display_one_vs_all(df, target_language, selected_languages):
    # Filter by target_language if specified
    if target_language:
        df = df[df['target_language'] == target_language]

        # Remove the target_language from selected_languages
        if target_language in selected_languages:
            selected_languages = [lang for lang in selected_languages if lang != target_language]

    # Select only the chosen languages (plus the 'model' column)
    columns_to_show = ['model'] + [language for language in selected_languages if language in df.columns]

    # Sort by the first selected language by default
    if selected_languages and selected_languages[0] in df.columns:
        df = df.sort_values(by=selected_languages[0], ascending=False)

    df = df[columns_to_show]

    # Round numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)

    return df, selected_languages

def create_leaderboard_display_multilingual(df, selected_country, selected_metrics):
    # Filter by country if specified
    if selected_country and selected_country.upper() != 'ALL':
        # print(f"Filtering leaderboard by country: {selected_country}")
        df = df[df['country'] == selected_country]
        df = df.drop(columns=['country'])

        # Select only the chosen metrics (plus the 'model' column)
        columns_to_show = ['model'] + [metric for metric in selected_metrics if metric in df.columns]
    else:
        # If no country is selected or 'All' is selected, keep all selected metrics
        # (plus the 'country' and 'model' columns) for ease of comparison
        columns_to_show = ['model', 'country'] + selected_metrics

    # Sort by the first selected metric by default
    if selected_metrics and selected_metrics[0] in df.columns:
        df = df.sort_values(by=selected_metrics[0], ascending=False)

    df = df[columns_to_show]

    # Round numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)

    return df

def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics:  # If no metrics are selected, show all of them
        selected_metrics = metrics
    df = load_leaderboard_multilingual()
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def update_leaderboard_one_vs_all(target_language, selected_languages):
    if not selected_languages:  # If no languages are selected, show the defaults
        selected_languages = default_languages
    df = load_leaderboard_one_vs_all()
    display_df, selected_languages = create_leaderboard_display_one_vs_all(df, target_language, selected_languages)
    # To keep the table readable when many languages (columns) are selected,
    # the 'model' column could be kept fixed:
    # display_df = render_fixed_columns(display_df)
    return display_df, selected_languages

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
    # Get the base64 string of the image
    img_base64 = encode_image_to_base64(image_path)

    # Create an HTML string with the embedded image and centering styles
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string

# Render an HTML table with a fixed (sticky) 'model' column
def render_fixed_columns(df):
    style = """
    <style>
        .table-container {
            overflow-x: auto;
            position: relative;
            white-space: nowrap;
        }
        table {
            border-collapse: collapse;
            width: 100%;
        }
        th, td {
            border: 1px solid black;
            padding: 8px;
            text-align: left;
        }
        th.fixed, td.fixed {
            position: sticky;
            left: 0;
            background-color: white;
            z-index: 2;
        }
    </style>
    """
    table_html = df.to_html(index=False)
    # Pin the 'model' header cell
    table_html = table_html.replace("<th>model</th>", '<th class="fixed">model</th>')
    # Pin the first cell of every body row (not just the first cell of the table)
    table_html = re.sub(r"<tr>(\s*)<td>", r'<tr>\1<td class="fixed">', table_html)
    return f"{style}<div class='table-container'>{table_html}</div>"

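# Minimal usage sketch (the DataFrame contents are whatever the leaderboard holds):
#   df = load_leaderboard_one_vs_all()
#   html = render_fixed_columns(df)
#   # `html` can then be passed to any HTML-rendering component; the 'model'
#   # column stays pinned while the rest of the table scrolls horizontally.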