sudoping01 committed (verified)
Commit dbe4d6a · 1 Parent(s): 32130a5

Update app.py

Files changed (1)
  1. app.py +184 -45
app.py CHANGED
@@ -8,24 +8,40 @@ import re

from huggingface_hub import login

token = os.environ.get("HG_TOKEN")
-login(token)

try:
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
    references = {row["id"]: row["text"] for row in dataset}
except Exception as e:
    references = {}

leaderboard_file = "leaderboard.csv"
if not os.path.exists(leaderboard_file):
-    pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False)
else:
    leaderboard_df = pd.read_csv(leaderboard_file)

    if "Combined_Score" not in leaderboard_df.columns:
        leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
        leaderboard_df.to_csv(leaderboard_file, index=False)

def normalize_text(text):
    """Normalize text for WER/CER calculation"""
@@ -62,6 +78,7 @@ def calculate_metrics(predictions_df):
            sample_wer = wer(reference, hypothesis)
            sample_cer = cer(reference, hypothesis)

            sample_wer = min(sample_wer, 2.0)
            sample_cer = min(sample_cer, 2.0)

@@ -77,7 +94,8 @@ def calculate_metrics(predictions_df):
                "wer": sample_wer,
                "cer": sample_cer
            })
-        except Exception:
            pass

    if not results:
@@ -98,22 +116,25 @@ def format_as_percentage(value):

def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
    """Format leaderboard for display with ranking and percentages"""
-    if len(df) == 0:
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

-
    display_df = df.copy()

    display_df = display_df.sort_values(sort_by)

    display_df.insert(0, "Rank", range(1, len(display_df) + 1))

    for col in ["WER", "CER", "Combined_Score"]:
        if col in display_df.columns:
            display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
-            display_df = display_df.drop(col, axis=1)

-    # Removed the clickable model name transformation

    return display_df

@@ -133,10 +154,18 @@ def update_ranking(method):

        return prepare_leaderboard_for_display(current_lb, sort_column)

-    except Exception:
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

def process_submission(model_name, csv_file):
    try:
        df = pd.read_csv(csv_file)

@@ -162,28 +191,42 @@ def process_submission(model_name, csv_file):
        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)

-            # suspiciously low values
            if avg_wer < 0.001:
                return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None

        except Exception as e:
            return f"Error calculating metrics: {str(e)}", None

        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Calculate combined score (70% WER, 30% CER)
        combined_score = avg_wer * 0.7 + avg_cer * 0.3

-        new_entry = pd.DataFrame(
-            [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
-            columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
-        )

-
-        updated_leaderboard = pd.concat([leaderboard, new_entry]).sort_values("Combined_Score")
        updated_leaderboard.to_csv(leaderboard_file, index=False)

        display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)

        return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
@@ -191,29 +234,56 @@ def process_submission(model_name, csv_file):
    except Exception as e:
        return f"Error processing submission: {str(e)}", None

-with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
    gr.Markdown(
        """
        # 🇲🇱 Bambara ASR Leaderboard

-        This leaderboard ranks and evaluates speech recognition models for the Bambara language.
-        Models are ranked based on a combined score of WER and CER metrics.
        """
    )

    with gr.Tabs() as tabs:
-        with gr.TabItem("🏅 Current Rankings"):
-            try:
-                current_leaderboard = pd.read_csv(leaderboard_file)
-
-                if "Combined_Score" not in current_leaderboard.columns:
-                    current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
-
-                display_leaderboard = prepare_leaderboard_for_display(current_leaderboard)
-            except Exception:
-                display_leaderboard = pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
-
-            gr.Markdown("### Current ASR Model Rankings")

            ranking_method = gr.Radio(
                ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
@@ -222,7 +292,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
            )

            leaderboard_view = gr.DataFrame(
-                value=display_leaderboard,
                interactive=False,
                label="Models are ranked by selected metric - lower is better"
            )
@@ -233,34 +303,60 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
                outputs=[leaderboard_view]
            )

-            gr.Markdown(
-                """
-                ## Metrics Explanation
-                - **WER (%)**: Word Error Rate (lower is better) - measures word-level accuracy
-                - **CER (%)**: Character Error Rate (lower is better) - measures character-level accuracy
-                - **Combined Score (%)**: Weighted average of WER (70%) and CER (30%) - provides a balanced evaluation
-                """
-            )

        with gr.TabItem("📊 Submit New Results"):
            gr.Markdown(
                """
                ### Submit a new model for evaluation

-                Upload a CSV file with 'id' and 'text' columns to evaluate your ASR predictions.
-                The 'id's must match those in the reference dataset.
                """
            )

            with gr.Row():
-                model_name_input = gr.Textbox(label="Model Name", placeholder="e.g., MALIBA-AI/asr")
-                csv_upload = gr.File(label="Upload CSV File", file_types=[".csv"])

-            submit_btn = gr.Button("Submit")
            output_msg = gr.Textbox(label="Status", interactive=False)
            leaderboard_display = gr.DataFrame(
                label="Updated Leaderboard",
-                value=display_leaderboard,
                interactive=False
            )

@@ -269,6 +365,49 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
                inputs=[model_name_input, csv_upload],
                outputs=[output_msg, leaderboard_display]
            )

if __name__ == "__main__":
    demo.launch()
 
app.py (updated file)

from huggingface_hub import login

+# Login to Hugging Face Hub (if token is available)
token = os.environ.get("HG_TOKEN")
+if token:
+    login(token)

+# Load reference dataset
try:
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
    references = {row["id"]: row["text"] for row in dataset}
+    print(f"Loaded {len(references)} reference transcriptions")
except Exception as e:
+    print(f"Error loading dataset: {str(e)}")
    references = {}

+# Initialize or load the leaderboard file
leaderboard_file = "leaderboard.csv"
if not os.path.exists(leaderboard_file):
+    # Create a new leaderboard with sample data for testing
+    sample_data = [
+        ["MALIBA-AI/bambara-asr-v1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
+        ["whisper-large-v3-bambara", 0.3120, 0.1870, 0.2745, "2025-02-20 14:22:33"]
+    ]
+    pd.DataFrame(sample_data,
+                 columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False)
+    print(f"Created new leaderboard file with sample data")
else:
    leaderboard_df = pd.read_csv(leaderboard_file)

+    # Ensure the Combined_Score column exists
    if "Combined_Score" not in leaderboard_df.columns:
        leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
        leaderboard_df.to_csv(leaderboard_file, index=False)
+        print(f"Added Combined_Score column to existing leaderboard")
+    print(f"Loaded leaderboard with {len(leaderboard_df)} entries")

def normalize_text(text):
    """Normalize text for WER/CER calculation"""
 
            sample_wer = wer(reference, hypothesis)
            sample_cer = cer(reference, hypothesis)

+            # Cap extreme values to prevent outliers from skewing results
            sample_wer = min(sample_wer, 2.0)
            sample_cer = min(sample_cer, 2.0)
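The wer() and cer() helpers called here are not defined in this hunk; assuming they are the functions of the same name from the jiwer package, the per-sample capping step can be reproduced in isolation. The reference and hypothesis strings below are made up for illustration:

# Standalone sketch of the per-sample metric + capping step; assumes jiwer's wer/cer,
# and the reference/hypothesis strings are purely illustrative.
from jiwer import wer, cer

reference = "an bɛ taa sugu la"    # hypothetical normalized reference text
hypothesis = "an bɛ taa sugu"      # hypothetical normalized model output

sample_wer = min(wer(reference, hypothesis), 2.0)  # cap outliers at 2.0, as above
sample_cer = min(cer(reference, hypothesis), 2.0)
print(f"WER={sample_wer:.3f}  CER={sample_cer:.3f}")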
 
 
                "wer": sample_wer,
                "cer": sample_cer
            })
+        except Exception as e:
+            print(f"Error processing sample {id_val}: {str(e)}")
            pass

    if not results:
 
def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
    """Format leaderboard for display with ranking and percentages"""
+    if df is None or len(df) == 0:
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

+    # Make a copy to avoid modifying the original
    display_df = df.copy()

+    # Sort by the selected metric (lower is better)
    display_df = display_df.sort_values(sort_by)

+    # Add ranking column
    display_df.insert(0, "Rank", range(1, len(display_df) + 1))

+    # Format numeric columns as percentages
    for col in ["WER", "CER", "Combined_Score"]:
        if col in display_df.columns:
            display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")

+    # Keep both the raw values and percentage displays
+    # This allows for proper sorting while showing formatted values

    return display_df
 
 
        return prepare_leaderboard_for_display(current_lb, sort_column)

+    except Exception as e:
+        print(f"Error updating ranking: {str(e)}")
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

def process_submission(model_name, csv_file):
+    """Process a new model submission"""
+    if not model_name or not model_name.strip():
+        return "Error: Please provide a model name.", None
+
+    if not csv_file:
+        return "Error: Please upload a CSV file.", None
+
    try:
        df = pd.read_csv(csv_file)
 
        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)

+            # Check for suspiciously low values
            if avg_wer < 0.001:
                return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None

        except Exception as e:
            return f"Error calculating metrics: {str(e)}", None

+        # Load existing leaderboard
        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Calculate combined score (70% WER, 30% CER)
        combined_score = avg_wer * 0.7 + avg_cer * 0.3

+        # Check if model already exists
+        if model_name in leaderboard["Model_Name"].values:
+            # Update existing entry
+            idx = leaderboard[leaderboard["Model_Name"] == model_name].index
+            leaderboard.loc[idx, "WER"] = avg_wer
+            leaderboard.loc[idx, "CER"] = avg_cer
+            leaderboard.loc[idx, "Combined_Score"] = combined_score
+            leaderboard.loc[idx, "timestamp"] = timestamp
+            updated_leaderboard = leaderboard
+        else:
+            # Add new entry
+            new_entry = pd.DataFrame(
+                [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
+                columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
+            )
+            updated_leaderboard = pd.concat([leaderboard, new_entry])

+        # Sort and save updated leaderboard
+        updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
        updated_leaderboard.to_csv(leaderboard_file, index=False)

+        # Prepare for display
        display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)

        return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
 
    except Exception as e:
        return f"Error processing submission: {str(e)}", None

+def get_current_leaderboard():
+    """Get the current leaderboard data for display"""
+    try:
+        if os.path.exists(leaderboard_file):
+            current_leaderboard = pd.read_csv(leaderboard_file)
+
+            if "Combined_Score" not in current_leaderboard.columns:
+                current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
+                current_leaderboard.to_csv(leaderboard_file, index=False)
+
+            return current_leaderboard
+        else:
+            return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
+    except Exception as e:
+        print(f"Error getting leaderboard: {str(e)}")
+        return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
+
+def create_leaderboard_table():
+    """Create and format the leaderboard table for display"""
+    leaderboard_data = get_current_leaderboard()
+    return prepare_leaderboard_for_display(leaderboard_data)
+
+with gr.Blocks(title="Bambara ASR Leaderboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🇲🇱 Bambara ASR Leaderboard

+        This leaderboard tracks and evaluates speech recognition models for the Bambara language.
+        Models are ranked based on Word Error Rate (WER), Character Error Rate (CER), and a combined score.
+
+        ## Current Models Performance
        """
    )

+    current_data = get_current_leaderboard()
+
+    # Highlight top-performing model
+    if len(current_data) > 0:
+        best_model = current_data.sort_values("Combined_Score").iloc[0]
+        gr.Markdown(f"""
+        ### 🏆 Current Best Model: **{best_model['Model_Name']}**
+        * WER: **{best_model['WER']*100:.2f}%**
+        * CER: **{best_model['CER']*100:.2f}%**
+        * Combined Score: **{best_model['Combined_Score']*100:.2f}%**
+        """)
+
    with gr.Tabs() as tabs:
+        with gr.TabItem("🏅 Model Rankings"):
+            # Pre-load the leaderboard data
+            initial_leaderboard = create_leaderboard_table()

            ranking_method = gr.Radio(
                ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
 
            )

            leaderboard_view = gr.DataFrame(
+                value=initial_leaderboard,
                interactive=False,
                label="Models are ranked by selected metric - lower is better"
            )
 
                outputs=[leaderboard_view]
            )

+            with gr.Accordion("Metrics Explanation", open=False):
+                gr.Markdown(
+                    """
+                    ## Understanding ASR Metrics
+
+                    ### Word Error Rate (WER)
+                    WER measures how accurately the ASR system recognizes whole words:
+                    * Lower values indicate better performance
+                    * Calculated as: (Substitutions + Insertions + Deletions) / Total Words
+                    * A WER of 0% means perfect transcription
+                    * A WER of 20% means approximately 1 in 5 words contains an error
+
+                    ### Character Error Rate (CER)
+                    CER measures accuracy at the character level:
+                    * More fine-grained than WER
+                    * Better at capturing partial word matches
+                    * Particularly useful for agglutinative languages like Bambara
+
+                    ### Combined Score
+                    * Weighted average: 70% WER + 30% CER
+                    * Provides a balanced evaluation of model performance
+                    * Used as the primary ranking metric
+                    """
+                )

        with gr.TabItem("📊 Submit New Results"):
            gr.Markdown(
                """
                ### Submit a new model for evaluation

+                Upload a CSV file with the following format:
+                * Must contain exactly two columns: 'id' and 'text'
+                * The 'id' column should match the reference dataset IDs
+                * The 'text' column should contain your model's transcriptions
                """
            )

            with gr.Row():
+                model_name_input = gr.Textbox(
+                    label="Model Name",
+                    placeholder="e.g., MALIBA-AI/bambara-asr",
+                    info="Use a descriptive name to identify your model"
+                )
+                csv_upload = gr.File(
+                    label="Upload CSV File",
+                    file_types=[".csv"],
+                    info="CSV with columns: id, text"
+                )

+            submit_btn = gr.Button("Submit", variant="primary")
            output_msg = gr.Textbox(label="Status", interactive=False)
            leaderboard_display = gr.DataFrame(
                label="Updated Leaderboard",
+                value=initial_leaderboard,
                interactive=False
            )
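For submitters, the CSV format requested in the tab above is straightforward to produce with pandas; the ids and transcriptions below are placeholders and must be replaced with the real benchmark ids and your model's output:

# Sketch of a well-formed submission file: exactly two columns, "id" and "text".
import pandas as pd

rows = [
    {"id": "sample_0001", "text": "placeholder transcription"},  # id must exist in the reference set
    {"id": "sample_0002", "text": "another placeholder"},
]
pd.DataFrame(rows, columns=["id", "text"]).to_csv("predictions.csv", index=False)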
 
 
                inputs=[model_name_input, csv_upload],
                outputs=[output_msg, leaderboard_display]
            )
+
+        with gr.TabItem("📝 Benchmark Dataset"):
+            gr.Markdown(
+                """
+                ## About the Benchmark Dataset
+
+                This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/sudoping01/bambara-speech-recognition-benchmark)** dataset:
+
+                * Contains diverse Bambara speech samples
+                * Includes various speakers, accents, and dialects
+                * Covers different speech styles and recording conditions
+                * Professionally transcribed and validated
+
+                ### How to Generate Predictions
+
+                To submit results to this leaderboard:
+
+                1. Download the audio files from the benchmark dataset
+                2. Run your ASR model on the audio files
+                3. Generate a CSV file with 'id' and 'text' columns
+                4. Submit your results using the form in the "Submit New Results" tab
+
+                ### Evaluation Guidelines
+
+                * Text is normalized (lowercase, punctuation removed) before metrics calculation
+                * Extreme outliers are capped to prevent skewing results
+                * All submissions are validated for format and completeness
+                """
+            )
+
+    gr.Markdown(
+        """
+        ---
+        ### About MALIBA-AI
+
+        **MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation**
+
+        *"No Malian Language Left Behind"*
+
+        This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
+        For more information, visit [MALIBA-AI on Hugging Face](https://huggingface.co/MALIBA-AI).
+        """
+    )

if __name__ == "__main__":
    demo.launch()
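The "How to Generate Predictions" steps in the Benchmark Dataset tab map roughly onto the following end-to-end sketch; transcribe() is a stand-in for whatever ASR model is being evaluated, and the name of the audio field is an assumption about the benchmark schema, not something this commit confirms:

# Rough sketch of producing a submission for this leaderboard.
# transcribe() is hypothetical; replace it with your own model's inference call.
import pandas as pd
from datasets import load_dataset

def transcribe(audio) -> str:
    raise NotImplementedError("run your ASR model on the audio sample here")

eval_set = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
predictions = [{"id": row["id"], "text": transcribe(row["audio"])} for row in eval_set]  # "audio" column assumed
pd.DataFrame(predictions, columns=["id", "text"]).to_csv("predictions.csv", index=False)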