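"""Bambara ASR Leaderboard.

A Gradio app that accepts CSV submissions of ASR transcriptions ('id', 'text'),
scores them against the bambara-speech-recognition-benchmark references with
WER and CER (simple and length-weighted averages), and maintains a leaderboard
that keeps each submitter's best entry.
"""
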
import gradio as gr
import pandas as pd
from datasets import load_dataset
from jiwer import wer, cer
import os
from datetime import datetime
import re

from huggingface_hub import login

# Authentication setup
token = os.environ.get("HG_TOKEN")
print(f"Token exists: {token is not None}")
if token:
    print(f"Token length: {len(token)}")
    print(f"Token first few chars: {token[:4]}...")
    login(token)  # only log in when a token is actually available
else:
    print("WARNING: HG_TOKEN not set; gated/private dataset access will fail")

print("Loading dataset...")
try:
    # `token` replaces the deprecated `use_auth_token` argument in recent `datasets` releases
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default", token=token)["eval"]
    print(f"Successfully loaded dataset with {len(dataset)} samples")
    references = {row["id"]: row["text"] for row in dataset}
except Exception as e:
    print(f"Error loading dataset: {str(e)}")
    # Fallback in case dataset can't be loaded
    references = {}
    print("WARNING: Using empty references dictionary due to dataset loading error")

# Initialize leaderboard file
leaderboard_file = "leaderboard.csv"
if not os.path.exists(leaderboard_file):
    pd.DataFrame(columns=["submitter", "WER", "CER", "weighted_WER", "weighted_CER", "samples_evaluated", "timestamp"]).to_csv(leaderboard_file, index=False)
else:
    print(f"Loaded existing leaderboard with {len(pd.read_csv(leaderboard_file))} entries")

def normalize_text(text):
    """
    Normalize text by converting to lowercase, removing punctuation, and normalizing whitespace.
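    Example: normalize_text("  I ni CE!  ") returns "i ni ce".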
    """
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    
    # Remove punctuation, keeping spaces
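    # (\w is Unicode-aware in Python 3, so non-ASCII Bambara letters are preserved)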
    text = re.sub(r'[^\w\s]', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def calculate_metrics(predictions_df):
    """
    Calculate WER and CER for each sample and return averages and per-sample results.
    Uses both standard average and length-weighted average.
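    The weighted averages scale each sample's error by its reference length
    (word count for WER, character count for CER), so longer references count more.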
    """
    per_sample_metrics = []
    total_ref_words = 0
    total_ref_chars = 0

    # Process each sample
    for _, row in predictions_df.iterrows():
        id_val = row["id"]
        if id_val not in references:
            print(f"Warning: ID {id_val} not found in references")
            continue
            
        reference = normalize_text(references[id_val])
        hypothesis = normalize_text(row["text"])
        
        if not reference or not hypothesis:
            print(f"Warning: Empty reference or hypothesis for ID {id_val}")
            continue
            
        reference_words = reference.split()
        reference_chars = list(reference)
        
        # Skip very short references for more stable metrics
        if len(reference_words) < 2:
            print(f"Warning: Reference too short for ID {id_val}, skipping")
            continue
        
        # Print debug info for the first few samples
        if len(per_sample_metrics) < 5:
            print(f"ID: {id_val}")
            print(f"Reference: '{reference}'")
            print(f"Hypothesis: '{hypothesis}'")
            print(f"Reference words: {reference_words}")
        
        try:
            # Calculate WER and CER
            sample_wer = wer(reference, hypothesis)
            sample_cer = cer(reference, hypothesis)
            
            # Cap metrics at sensible values to prevent outliers
            sample_wer = min(sample_wer, 2.0)  # Cap at 200% WER
            sample_cer = min(sample_cer, 2.0)  # Cap at 200% CER
            
            # For weighted calculations
            total_ref_words += len(reference_words)
            total_ref_chars += len(reference_chars)
            
            if len(per_sample_metrics) < 5:
                print(f"WER: {sample_wer}, CER: {sample_cer}")
            
            per_sample_metrics.append({
                "id": id_val,
                "reference": reference,
                "hypothesis": hypothesis,
                "ref_word_count": len(reference_words),
                "ref_char_count": len(reference_chars),
                "wer": sample_wer,
                "cer": sample_cer
            })
        except Exception as e:
            print(f"Error calculating metrics for ID {id_val}: {str(e)}")
    
    if not per_sample_metrics:
        raise ValueError("No valid samples for WER/CER calculation")
    
    # Calculate standard average metrics
    avg_wer = sum(item["wer"] for item in per_sample_metrics) / len(per_sample_metrics)
    avg_cer = sum(item["cer"] for item in per_sample_metrics) / len(per_sample_metrics)
    
    # Calculate weighted average metrics based on reference length
    weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in per_sample_metrics) / total_ref_words
    weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in per_sample_metrics) / total_ref_chars
    
    print(f"Simple average WER: {avg_wer:.4f}, CER: {avg_cer:.4f}")
    print(f"Weighted average WER: {weighted_wer:.4f}, CER: {weighted_cer:.4f}")
    print(f"Processed {len(per_sample_metrics)} valid samples")
    
    return avg_wer, avg_cer, weighted_wer, weighted_cer, per_sample_metrics

def styled_error(message):
    """Format error messages with red styling"""
    return f"<div style='color: red; font-weight: bold; padding: 10px; border-radius: 5px; background-color: #ffe0e0;'>{message}</div>"

def styled_success(message):
    """Format success messages with green styling"""
    return f"<div style='color: green; font-weight: bold; padding: 10px; border-radius: 5px; background-color: #e0ffe0;'>{message}</div>"

def styled_info(message):
    """Format informational messages with blue styling"""
    return f"<div style='color: #004080; padding: 10px; border-radius: 5px; background-color: #e0f0ff;'>{message}</div>"

def process_submission(submitter_name, csv_file):
    """
    Process a submission CSV, calculate metrics, and update the leaderboard.
    Returns a status message and updated leaderboard.
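    Expects a CSV with exactly 'id' and 'text' columns whose ids match the reference dataset.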
    """
    try:
        # Validate submitter name
        if not submitter_name or len(submitter_name.strip()) < 3:
            return styled_error("Please provide a valid submitter name (at least 3 characters)"), None
        
        # Read and validate the uploaded CSV
        df = pd.read_csv(csv_file)
        print(f"Processing submission from {submitter_name} with {len(df)} rows")
        
        # Basic validation
        if len(df) == 0:
            return styled_error("Error: Uploaded CSV is empty."), None
        
        if len(df) < 10:
            return styled_error("Error: Submission contains too few samples (minimum 10 required)."), None
            
        if set(df.columns) != {"id", "text"}:
            return styled_error(f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}"), None
            
        if df["id"].duplicated().any():
            dup_ids = df[df["id"].duplicated()]["id"].unique()
            return styled_error(f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}."), None

        # Ensure text column contains strings
        df["text"] = df["text"].astype(str)

        # Check for valid references
        if not references:
            return styled_error("Error: Reference dataset could not be loaded. Please try again later."), None

        # Check if IDs match the reference dataset
        missing_ids = set(references.keys()) - set(df["id"])
        extra_ids = set(df["id"]) - set(references.keys())
        
        if missing_ids:
            return styled_error(f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}."), None
            
        if extra_ids:
            return styled_error(f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}."), None
        
        # Check for suspicious submissions (high percentage of exact matches)
        exact_matches = 0
        for _, row in df.iterrows():
            if normalize_text(row["text"]) == normalize_text(references[row["id"]]):
                exact_matches += 1
        
        exact_match_ratio = exact_matches / len(df)
        if exact_match_ratio > 0.95:  # more than 95% exact matches suggests the reference texts were copied
            return styled_error("Suspicious submission: Too many exact matches with reference texts."), None
        
        # Calculate metrics
        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)
            
            # Debug information
            print(f"Calculated metrics - WER: {avg_wer:.4f}, CER: {avg_cer:.4f}")
            print(f"Weighted metrics - WER: {weighted_wer:.4f}, CER: {weighted_cer:.4f}")
            print(f"Processed {len(detailed_results)} valid samples")
            
            # Check for suspiciously low values
            if avg_wer < 0.001 or weighted_wer < 0.001:
                print("WARNING: WER is extremely low - likely an error")
                return styled_error("Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV."), None
                
        except Exception as e:
            print(f"Error in metrics calculation: {str(e)}")
            return styled_error(f"Error calculating metrics: {str(e)}"), None
        
        # Update the leaderboard
        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        new_entry = pd.DataFrame(
            [[submitter_name, avg_wer, avg_cer, weighted_wer, weighted_cer, len(detailed_results), timestamp]],
            columns=["submitter", "WER", "CER", "weighted_WER", "weighted_CER", "samples_evaluated", "timestamp"]
        )
        
        # Combine with existing leaderboard and keep only the best submission per submitter
        combined = pd.concat([leaderboard, new_entry])
        # Sort by WER (ascending) and get first entry for each submitter
        best_entries = combined.sort_values("WER").groupby("submitter").first().reset_index()
        # Sort the resulting dataframe by WER
        updated_leaderboard = best_entries.sort_values("WER")
        updated_leaderboard.to_csv(leaderboard_file, index=False)
        
        # Create detailed metrics summary
        metrics_summary = f"""
        <h3>Submission Results</h3>
        <table>
            <tr><td><b>Submitter:</b></td><td>{submitter_name}</td></tr>
            <tr><td><b>Word Error Rate (WER):</b></td><td>{avg_wer:.4f}</td></tr>
            <tr><td><b>Character Error Rate (CER):</b></td><td>{avg_cer:.4f}</td></tr>
            <tr><td><b>Weighted WER:</b></td><td>{weighted_wer:.4f}</td></tr>
            <tr><td><b>Weighted CER:</b></td><td>{weighted_cer:.4f}</td></tr>
            <tr><td><b>Samples Evaluated:</b></td><td>{len(detailed_results)}</td></tr>
            <tr><td><b>Submission Time:</b></td><td>{timestamp}</td></tr>
        </table>
        """
        
        return styled_success("Submission processed successfully!") + styled_info(metrics_summary), updated_leaderboard
        
    except Exception as e:
        print(f"Error processing submission: {str(e)}")
        return styled_error(f"Error processing submission: {str(e)}"), None

# Create the Gradio interface
with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
    gr.Markdown(
        """
        # Bambara ASR Leaderboard
        
        Upload a CSV file with `id` and `text` columns to evaluate your ASR predictions.
        The `id` values must match those in the benchmark dataset.
        
        ## Metrics
        - **WER**: Word Error Rate (lower is better) - measures word-level accuracy
        - **CER**: Character Error Rate (lower is better) - measures character-level accuracy
        
        We report both standard averages and length-weighted averages (where longer samples have more influence on the final score).
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            submitter = gr.Textbox(
                label="Submitter Name or Model Name", 
                placeholder="e.g., MALIBA-AI/asr",
                info="Name to appear on the leaderboard"
            )
            csv_upload = gr.File(
                label="Upload CSV File ('id' and 'text' columns)",
                file_types=[".csv"]
            )
            submit_btn = gr.Button("Submit", variant="primary")
            
        with gr.Column(scale=2):
            with gr.Accordion("Submission Format", open=False):
                gr.Markdown(
                    """
                    ### CSV Format Requirements
                    
                    Your CSV file must:
                    - Have exactly two columns: `id` and `text`
                    - The `id` column must match the IDs in the reference dataset
                    - The `text` column should contain your model's transcriptions
                    
                    Example:
                    ```
                    id,text
                    audio_001,n ye foro ka taa
                    audio_002,i ni ce
                    ```
                    
                    ### Evaluation Process
                    
                    Your submissions are evaluated by:
                    1. Normalizing both reference and predicted text (lowercase, punctuation removal)
                    2. Calculating Word Error Rate (WER) and Character Error Rate (CER)
                    3. Computing both simple average and length-weighted average
                    4. Ranking on the leaderboard by WER (lower is better)
                    
                    Only your best submission is kept on the leaderboard.
                    """
                )
    
    output_msg = gr.HTML(label="Status")
    
    # Leaderboard display
    with gr.Accordion("Leaderboard", open=True):
        leaderboard_display = gr.DataFrame(
            label="Current Standings",
            value=pd.read_csv(leaderboard_file),
            interactive=False
        )
    
    submit_btn.click(
        fn=process_submission,
        inputs=[submitter, csv_upload],
        outputs=[output_msg, leaderboard_display]
    )

# Print startup message
print("Starting Bambara ASR Leaderboard app...")

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
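
# A minimal sketch of how a submitter might build a valid CSV (illustrative only;
# the ids below are the placeholder examples from the docs above, not real dataset ids):
#
#   import pandas as pd
#   preds = {"audio_001": "n ye foro ka taa", "audio_002": "i ni ce"}
#   pd.DataFrame({"id": list(preds), "text": list(preds.values())}).to_csv(
#       "submission.csv", index=False
#   )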