import base64
import json
import os
import re  # used in render_fixed_columns to pin the first cell of every table row

import fasttext
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    balanced_accuracy_score,
    matthews_corrcoef,
)

# Constants
MODEL_REPO = "atlasia/Sfaya-Moroccan-Darija-vs-All"
BIN_FILENAME = "model_multi_v3_2fpr.bin"
BINARY_LEADERBOARD_FILE = "darija_leaderboard_binary.json"
MULTILINGUAL_LEADERBOARD_FILE = "darija_leaderboard_multilingual.json"
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"

target_label = "Morocco"
is_binary = False

# Load test dataset
test_dataset = load_dataset(DATA_PATH, split='test')

# Supported dialects
all_target_languages = list(test_dataset.unique("dialect"))
supported_dialects = all_target_languages + ['All']
languages_to_display_one_vs_all = all_target_languages  # everything except All

metrics = [
    'f1_score',
    'precision',
    'recall',
    'specificity',
    'false_positive_rate',
    'false_negative_rate',
    'negative_predictive_value',
    'n_test_samples',
]

default_metrics = [
    'f1_score',
    'precision',
    'recall',
    'false_positive_rate',
    'false_negative_rate'
]

# Default languages to display in the one-vs-all leaderboard
default_languages = [
    'Morocco',
    'MSA',
    'Egypt',
    'Algeria',
    'Tunisia',
    'Levantine',
]

language_mapping_dict = {
    'ace_Arab': 'Acehnese',
    'acm_Arab': 'Mesopotamia',    # Gilit Mesopotamian
    'aeb_Arab': 'Tunisia',
    'ajp_Arab': 'Levantine',      # South Levantine
    'apc_Arab': 'Levantine',
    'arb_Arab': 'MSA',
    'arq_Arab': 'Algeria',
    'ars_Arab': 'Saudi',          # Najdi is primarily Saudi Arabian
    'ary_Arab': 'Morocco',
    'arz_Arab': 'Egypt',
    'ayp_Arab': 'Mesopotamia',    # North Mesopotamian
    'azb_Arab': 'Azerbaijan',     # South Azerbaijani pertains to this region
    'bcc_Arab': 'Balochistan',    # Southern Balochi is from Balochistan
    'bjn_Arab': 'Indonesia',      # Banjar is spoken in Indonesia
    'brh_Arab': 'Pakistan',       # Brahui is spoken in Pakistan
    'ckb_Arab': 'Kurdistan',      # Central Kurdish is mainly in Iraq
    'fuv_Arab': 'Nigeria',        # Hausa States Fulfulde
    'glk_Arab': 'Iran',           # Gilaki is spoken in Iran
    'hac_Arab': 'Iran',           # Gurani is also primarily spoken in Iran
    'kas_Arab': 'Kashmir',
    'knc_Arab': 'Nigeria',        # Central Kanuri is in Nigeria
    'lki_Arab': 'Iran',           # Laki is from Iran
    'lrc_Arab': 'Iran',           # Northern Luri is from Iran
    'min_Arab': 'Indonesia',      # Minangkabau is spoken in Indonesia
    'mzn_Arab': 'Iran',           # Mazanderani is spoken in Iran
    'ota_Arab': 'Turkey',         # Ottoman Turkish
    'pbt_Arab': 'Afghanistan',    # Southern Pashto
    'pnb_Arab': 'Pakistan',       # Western Panjabi
    'sdh_Arab': 'Iraq',           # Southern Kurdish
    'shu_Arab': 'Chad',           # Chadian Arabic
    'skr_Arab': 'Pakistan',       # Saraiki
    'snd_Arab': 'Pakistan',       # Sindhi
    'sus_Arab': 'Guinea',         # Susu
    'tuk_Arab': 'Turkmenistan',   # Turkmen
    'uig_Arab': 'Uighur (China)', # Uighur
    'urd_Arab': 'Pakistan',       # Urdu
    'uzs_Arab': 'Uzbekistan',     # Southern Uzbek
    'zsm_Arab': 'Malaysia'        # Standard Malay
}

def predict_label(text, model, language_mapping_dict, use_mapping=False):
    """Predict the dialect label for a single text using a fastText model."""
    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')
    if text == '':
        return 'Other'
    try:
        # Get the top prediction
        prediction = model.predict(text, 1)
        # Extract the label and remove the __label__ prefix
        label = prediction[0][0].replace('__label__', '')
        # Extract the confidence score (currently unused)
        confidence = prediction[1][0]
        # Map the raw label to a language name using language_mapping_dict
        if use_mapping:
            label = language_mapping_dict.get(label, 'Other')
        return label
    except Exception as e:
        print(f"Error processing text: {text}")
        print(f"Exception: {e}")
        # Return a plain string so the failure case has the same type as the success case
        return 'Other'

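# Minimal usage sketch (assumes a locally available copy of the fastText binary;
# the sample text and scores are purely illustrative):
#   model = fasttext.load_model(BIN_FILENAME)
#   model.predict("some darija text", 1)
#   # -> (('__label__ary_Arab',), array([0.98]))
#   predict_label("some darija text", model, language_mapping_dict, use_mapping=True)
#   # -> 'Morocco'
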
def compute_classification_metrics(test_dataset):
    """
    Compute comprehensive classification metrics for each class.

    Args:
        test_dataset: Dataset (convertible to a DataFrame) containing 'dialect' as true labels
            and 'preds' as predicted labels.

    Returns:
        tuple: (pd.DataFrame with detailed per-class metrics, dict of summary metrics).
    """
    # Transform the dataset into a DataFrame
    data = pd.DataFrame(test_dataset)

    # Extract true labels and predictions
    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])

    # Handle all unique labels
    labels = sorted(list(set(true_labels + predicted_labels)))
    label_to_index = {label: index for index, label in enumerate(labels)}

    # Convert labels to indices
    true_indices = [label_to_index[label] for label in true_labels]
    pred_indices = [label_to_index[label] for label in predicted_labels]

    # Compute basic metrics
    f1_scores = f1_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

    # Compute confusion matrix
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Calculate per-class counts
    FP = conf_mat.sum(axis=0) - np.diag(conf_mat)  # False Positives
    FN = conf_mat.sum(axis=1) - np.diag(conf_mat)  # False Negatives
    TP = np.diag(conf_mat)                         # True Positives
    TN = conf_mat.sum() - (FP + FN + TP)           # True Negatives

    # Calculate sample counts per class
    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Calculate additional metrics
    with np.errstate(divide='ignore', invalid='ignore'):
        fp_rate = FP / (FP + TN)      # False Positive Rate
        fn_rate = FN / (FN + TP)      # False Negative Rate
        specificity = TN / (TN + FP)  # True Negative Rate
        npv = TN / (TN + FN)          # Negative Predictive Value

    # Replace NaN/inf with 0
    rates = [fp_rate, fn_rate, specificity, npv]
    rates = [np.nan_to_num(m, nan=0.0, posinf=0.0, neginf=0.0) for m in rates]
    fp_rate, fn_rate, specificity, npv = rates

    # Calculate overall metrics
    balanced_acc = balanced_accuracy_score(true_indices, pred_indices)
    mcc = matthews_corrcoef(true_indices, pred_indices)

    # Compile results into a DataFrame
    result_df = pd.DataFrame({
        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,
        'false_positive_rate': fp_rate,
        'false_negative_rate': fn_rate,
        'true_positives': TP,
        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
        'negative_predictive_value': npv
    })

    # Sort by number of samples (descending)
    result_df = result_df.sort_values('samples', ascending=False)

    # Calculate summary metrics
    summary_metrics = {
        'macro_f1': f1_score(true_indices, pred_indices, average='macro'),
        'weighted_f1': f1_score(true_indices, pred_indices, average='weighted'),
        'micro_f1': f1_score(true_indices, pred_indices, average='micro'),
        'balanced_accuracy': balanced_acc,
        'matthews_correlation': mcc
    }

    # Round all numeric columns to 4 decimal places
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

    print(f'result_df: {result_df}')

    return result_df, summary_metrics

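# Worked example of the per-class counts above (hypothetical 2-class confusion matrix,
# rows = true labels, columns = predictions; values chosen only for illustration):
#   conf_mat = [[40,  5],    # class 0: 40 correct, 5 predicted as class 1
#               [10, 45]]    # class 1: 10 predicted as class 0, 45 correct
#   TP = diag                 = [40, 45]
#   FP = col_sum - diag       = [10,  5]
#   FN = row_sum - diag       = [ 5, 10]
#   TN = total - TP - FP - FN = [45, 40]
#   fp_rate = FP / (FP + TN)  = [10/55, 5/45] ≈ [0.1818, 0.1111]
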
def make_binary(dialect, target):
    if dialect != target:
        return 'Other'
    return target

def run_eval_one_vs_all(model, data_test, TARGET_LANG='Morocco', language_mapping_dict=None, use_mapping=False):
    """Compute, for every dialect, the rate at which its samples are predicted as TARGET_LANG."""
    # Predict labels using the model
    print(f"[INFO] Running predictions...")
    data_test['preds'] = data_test['text'].apply(
        lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping)
    )

    # Map the true labels to a binary target-vs-Other view
    # (kept for reference; the rate computation below works on data_test directly)
    df_test_preds = data_test.copy()
    df_test_preds.loc[df_test_preds['dialect'] != TARGET_LANG, 'dialect'] = 'Other'

    # Compute, per true dialect, the share of samples predicted as each label
    dialect_counts = data_test.groupby('dialect')['dialect'].count().reset_index(name='size')
    result_df = pd.merge(dialect_counts, data_test, on='dialect')
    result_df = result_df.groupby(['dialect', 'size', 'preds'])['preds'].count() / result_df.groupby(['dialect', 'size'])['preds'].count()
    result_df.sort_index(ascending=False, level='size', inplace=True)

    # Keep only the share of samples predicted as TARGET_LANG
    # (the false positive rate for every dialect other than the target itself)
    out = result_df.copy()
    out.name = 'false_positive_rate'
    out = out.reset_index()
    out = out[out['preds'] == TARGET_LANG].drop(columns=['preds', 'size'])

    print(f'out for TARGET_LANG={TARGET_LANG} \n: {out}')
    return out

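# The returned DataFrame has two columns, for example (hypothetical values):
#       dialect  false_positive_rate
#   0   Morocco               0.9876   # share of true Morocco samples predicted 'Morocco' (this is recall, not an FPR)
#   1   Algeria               0.0213
#   2       MSA               0.0045
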
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    # Resolve the leaderboard file relative to this script (consistent with load_leaderboard_one_vs_all)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    BINARY_LEADERBOARD_FILE = os.path.join(current_dir, BINARY_LEADERBOARD_FILE)
    try:
        with open(BINARY_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    # Process the results for each dialect/country
    for _, row in result_df.iterrows():
        dialect = row['dialect']
        # Skip the 'Other' class, it is considered the null space
        if dialect == 'Other':
            continue

        # Find the existing target_lang entry or create a new one
        target_entry = next((item for item in data if target_lang in item), None)
        if target_entry is None:
            target_entry = {target_lang: {}}
            data.append(target_entry)

        # Get the country-specific data for this target language
        country_data = target_entry[target_lang]

        # Initialize the dialect/country entry if it doesn't exist
        if dialect not in country_data:
            country_data[dialect] = {}

        # Update the model metrics under the model name for the given dialect
        country_data[dialect][model_name] = float(row['false_positive_rate'])

        # # Add the number of test samples, if not already present
        # if "n_test_samples" not in country_data[dialect]:
        #     country_data[dialect]["n_test_samples"] = int(row['size'])

    # Save updated leaderboard data
    with open(BINARY_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)

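# Resulting JSON layout of the binary leaderboard (one entry per target language;
# the model name and numbers below are illustrative, not real results):
# [
#     {
#         "Morocco": {
#             "Algeria": {"some-org/some-model": 0.0213},
#             "MSA": {"some-org/some-model": 0.0045}
#         }
#     }
# ]
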
def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    # Keep the repo id for naming before overwriting model_path with the local cached file path
    model_repo_id = model_path

    # Download the model and get its local path
    model_path = hf_hub_download(repo_id=model_repo_id, filename=model_path_bin, cache_dir=None)

    # Load the trained model
    print(f"[INFO] Loading model from Path: {model_path}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path)

    # Load the evaluation dataset
    print(f"[INFO] Loading evaluation dataset from Path: {DATA_PATH}...")
    eval_dataset = load_dataset(DATA_PATH, split='test')

    # Transform it into a pandas DataFrame
    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)

    # Run the multilingual evaluation
    result_df, _ = run_eval(model, df_eval, language_mapping_dict, use_mapping=use_mapping)

    # Set the model name from the repo id and the binary filename
    model_name = model_repo_id + '/' + model_path_bin

    # Update the multilingual leaderboard
    update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE)

    # Run the one-vs-all evaluation for every target language and update the binary leaderboard
    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(model, df_eval, TARGET_LANG=target_lang, language_mapping_dict=language_mapping_dict, use_mapping=use_mapping)
        update_darija_one_vs_all_leaderboard(result_df_one_vs_all, model_name, target_lang, BINARY_LEADERBOARD_FILE)

    # Load the updated leaderboard tables
    df_multilingual = load_leaderboard_multilingual()
    df_one_vs_all = load_leaderboard_one_vs_all()

    status_message = "**Evaluation completed! 🤗**"

    return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message

def run_eval(model, df_eval, language_mapping_dict=None, use_mapping=False):
    """Run the multilingual evaluation on a dataset and compute per-class metrics.

    Args:
        model: The fastText model to evaluate.
        df_eval (pd.DataFrame): Evaluation data with 'text' and 'dialect' columns.
        language_mapping_dict (dict): Optional mapping from raw model labels to language names.
        use_mapping (bool): If True, map raw labels through language_mapping_dict.

    Returns:
        tuple: (pd.DataFrame of per-class evaluation metrics, pd.DataFrame with the predictions).
    """
    # Predict labels using the model
    print(f"[INFO] Running predictions...")
    df_eval['preds'] = df_eval['text'].apply(
        lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping)
    )

    # Drop the columns that are no longer needed, i.e. 'text', 'metadata' and 'dataset_source'
    df_eval = df_eval.drop(columns=['text', 'metadata', 'dataset_source'])

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval)

    # update_darija_multilingual_leaderboard(result_df, model_path, MULTILINGUAL_LEADERBOARD_FILE)
    return result_df, df_eval

def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    try:
        if file is None:
            return "Please upload a file."

        # Clean the model name so it is safe to use in file paths
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
        print(f"[INFO] uploaded_model_name: {uploaded_model_name}")

        # Create the directory for saving submissions
        path_saving = os.path.join(base_path_save, uploaded_model_name)
        os.makedirs(path_saving, exist_ok=True)

        # Define the full path to save the file
        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as a DataFrame
        print(f"[INFO] Loading results...")
        df_eval = pd.read_csv(file.name)

        # Save the DataFrame
        print(f"[INFO] Saving the file locally in: {saved_file_path}")
        df_eval.to_csv(saved_file_path, index=False)

    except Exception as e:
        return f"Error processing file: {str(e)}"

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval)

    # Update the multilingual leaderboard
    update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTILINGUAL_LEADERBOARD_FILE)

    # TODO: implement one_vs_all differently for CSV-only submissions. They would need to submit two files,
    # one for the multilingual evaluation and one for the one-vs-all evaluation.
    # result_df_one_vs_all = run_eval_one_vs_all(...)
    # update_darija_one_vs_all_leaderboard(...)

    # Refresh the leaderboard table
    df = load_leaderboard_multilingual()

    return create_leaderboard_display_multilingual(df, default_language, default_metrics)

def update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    # Load leaderboard data
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)
    try:
        with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    # Process the results for each dialect/country
    for _, row in result_df.iterrows():
        country = row['country']
        # Skip the 'Other' class, it is considered the null space
        if country == 'Other':
            continue

        # Create the metrics dictionary
        metrics = {
            'f1_score': float(row['f1_score']),
            'precision': float(row['precision']),
            'recall': float(row['recall']),
            'specificity': float(row['specificity']),
            'false_positive_rate': float(row['false_positive_rate']),
            'false_negative_rate': float(row['false_negative_rate']),
            'negative_predictive_value': float(row['negative_predictive_value']),
            'n_test_samples': int(row['samples'])
        }

        # Find the existing country entry or create a new one
        country_entry = next((item for item in data if country in item), None)
        if country_entry is None:
            country_entry = {country: {}}
            data.append(country_entry)

        # Update the model metrics directly under the model name
        if country not in country_entry:
            country_entry[country] = {}
        country_entry[country][model_name] = metrics

    # Save updated leaderboard data
    with open(MULTILINGUAL_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)

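# Resulting JSON layout of the multilingual leaderboard (one entry per country;
# the model name and numbers below are illustrative, not real results):
# [
#     {
#         "Morocco": {
#             "some-org/some-model": {
#                 "f1_score": 0.95, "precision": 0.94, "recall": 0.96,
#                 "specificity": 0.99, "false_positive_rate": 0.01,
#                 "false_negative_rate": 0.04, "negative_predictive_value": 0.99,
#                 "n_test_samples": 1000
#             }
#         }
#     }
# ]
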
def load_leaderboard_one_vs_all(BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    BINARY_LEADERBOARD_FILE = os.path.join(current_dir, BINARY_LEADERBOARD_FILE)
    with open(BINARY_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Flatten the nested JSON into rows
    rows = []

    # Process each target language's data
    for leaderboard_data in data:
        for target_language, results in leaderboard_data.items():
            for language, models in results.items():
                for model_name, false_positive_rate in models.items():
                    rows.append({
                        'target_language': target_language,
                        'language': language,
                        'model': model_name,
                        'false_positive_rate': false_positive_rate,
                    })

    # Convert to DataFrame
    df = pd.DataFrame(rows)

    # Pivot so that models are rows, languages are columns, and each cell holds the
    # (model, target_language, language) false positive rate
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()

    # print(f'df_pivot \n: {df_pivot}')
    return df_pivot

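# Shape of the pivoted table (hypothetical model name and values, for illustration only):
#   model                 target_language  Algeria  Egypt    MSA    ...
#   some-org/some-model   Morocco           0.0213  0.0102  0.0045  ...
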
def load_leaderboard_multilingual(MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)
    with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Flatten the nested JSON into rows
    rows = []

    # Process each country's data
    for country_data in data:
        for country, models in country_data.items():
            for model_name, metrics in models.items():
                row = {
                    'country': country,
                    'model': model_name,
                }
                # Add all metrics to the row
                row.update(metrics)
                rows.append(row)

    # Convert to DataFrame
    df = pd.DataFrame(rows)
    return df

def create_leaderboard_display_one_vs_all(df, target_language, selected_languages):
    # Filter by target_language if specified
    if target_language:
        df = df[df['target_language'] == target_language]

        # Remove the target_language from selected_languages
        if target_language in selected_languages:
            selected_languages = [lang for lang in selected_languages if lang != target_language]

    # Select only the chosen languages (plus the 'model' column)
    columns_to_show = ['model'] + [language for language in selected_languages if language in df.columns]

    # Sort by the first selected language by default
    if selected_languages and selected_languages[0] in df.columns:
        df = df.sort_values(by=selected_languages[0], ascending=False)

    df = df[columns_to_show]

    # Round numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)

    return df, selected_languages

def create_leaderboard_display_multilingual(df, selected_country, selected_metrics):
    # Filter by country if specified
    if selected_country and selected_country.upper() != 'ALL':
        # print(f"Filtering leaderboard by country: {selected_country}")
        df = df[df['country'] == selected_country]
        df = df.drop(columns=['country'])

        # Select only the chosen metrics (plus the 'model' column)
        columns_to_show = ['model'] + [metric for metric in selected_metrics if metric in df.columns]
    else:
        # If no country is selected or 'All' is selected, keep all selected metrics
        # (plus the 'country' and 'model' columns) for ease of comparison
        columns_to_show = ['model', 'country'] + selected_metrics

    # Sort by the first selected metric by default
    if selected_metrics and selected_metrics[0] in df.columns:
        df = df.sort_values(by=selected_metrics[0], ascending=False)

    df = df[columns_to_show]

    # Round numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)

    return df

def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics:  # If no metrics are selected, show all of them
        selected_metrics = metrics
    df = load_leaderboard_multilingual()
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def update_leaderboard_one_vs_all(target_language, selected_languages):
    if not selected_languages:  # If no languages are selected, show the defaults
        selected_languages = default_languages
    df = load_leaderboard_one_vs_all()
    display_df, selected_languages = create_leaderboard_display_one_vs_all(df, target_language, selected_languages)
    # To keep the table readable when many languages (columns) are selected,
    # the 'model' column could be kept fixed:
    # display_df = render_fixed_columns(display_df)
    return display_df, selected_languages

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
    # Get the base64 string of the image
    img_base64 = encode_image_to_base64(image_path)

    # Create an HTML string with the embedded image and centering styles
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string

# Render an HTML table with a fixed (sticky) 'model' column
def render_fixed_columns(df):
    style = """
    <style>
        .table-container {
            overflow-x: auto;
            position: relative;
            white-space: nowrap;
        }
        table {
            border-collapse: collapse;
            width: 100%;
        }
        th, td {
            border: 1px solid black;
            padding: 8px;
            text-align: left;
        }
        th.fixed, td.fixed {
            position: sticky;
            left: 0;
            background-color: white;
            z-index: 2;
        }
    </style>
    """
    table_html = df.to_html(index=False)
    # Pin the 'model' header cell
    table_html = table_html.replace("<th>model</th>", '<th class="fixed">model</th>')
    # Pin the first cell of every body row (not just the first cell of the table)
    table_html = re.sub(r"<tr>(\s*)<td>", r'<tr>\1<td class="fixed">', table_html)
    return f"{style}<div class='table-container'>{table_html}</div>"

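# Minimal usage sketch (the DataFrame contents are whatever the leaderboard holds):
#   df = load_leaderboard_one_vs_all()
#   html = render_fixed_columns(df)
#   # `html` can then be passed to any HTML-rendering component; the 'model'
#   # column stays pinned while the rest of the table scrolls horizontally.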