sudoping01 committed (verified)
Commit dbe4d6a · 1 Parent(s): 32130a5

Update app.py

Files changed (1)
  1. app.py +184 -45
app.py CHANGED
@@ -8,24 +8,40 @@ import re

from huggingface_hub import login

token = os.environ.get("HG_TOKEN")
-login(token)

try:
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
    references = {row["id"]: row["text"] for row in dataset}
except Exception as e:
    references = {}

leaderboard_file = "leaderboard.csv"
if not os.path.exists(leaderboard_file):
-    pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False)
else:
    leaderboard_df = pd.read_csv(leaderboard_file)

    if "Combined_Score" not in leaderboard_df.columns:
        leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
        leaderboard_df.to_csv(leaderboard_file, index=False)

def normalize_text(text):
    """Normalize text for WER/CER calculation"""
@@ -62,6 +78,7 @@ def calculate_metrics(predictions_df):
            sample_wer = wer(reference, hypothesis)
            sample_cer = cer(reference, hypothesis)

            sample_wer = min(sample_wer, 2.0)
            sample_cer = min(sample_cer, 2.0)

@@ -77,7 +94,8 @@ def calculate_metrics(predictions_df):
                "wer": sample_wer,
                "cer": sample_cer
            })
-        except Exception:
            pass

    if not results:
@@ -98,22 +116,25 @@ def format_as_percentage(value):

def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
    """Format leaderboard for display with ranking and percentages"""
-    if len(df) == 0:
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

-
    display_df = df.copy()

    display_df = display_df.sort_values(sort_by)

    display_df.insert(0, "Rank", range(1, len(display_df) + 1))

    for col in ["WER", "CER", "Combined_Score"]:
        if col in display_df.columns:
            display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
-            display_df = display_df.drop(col, axis=1)

-    # Removed the clickable model name transformation

    return display_df

@@ -133,10 +154,18 @@ def update_ranking(method):

        return prepare_leaderboard_for_display(current_lb, sort_column)

-    except Exception:
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

def process_submission(model_name, csv_file):
    try:
        df = pd.read_csv(csv_file)

@@ -162,28 +191,42 @@ def process_submission(model_name, csv_file):
        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)

-            # suspiciously low values
            if avg_wer < 0.001:
                return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None

        except Exception as e:
            return f"Error calculating metrics: {str(e)}", None

        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Calculate combined score (70% WER, 30% CER)
        combined_score = avg_wer * 0.7 + avg_cer * 0.3

-        new_entry = pd.DataFrame(
-            [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
-            columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
-        )

-
-        updated_leaderboard = pd.concat([leaderboard, new_entry]).sort_values("Combined_Score")
        updated_leaderboard.to_csv(leaderboard_file, index=False)

        display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)

        return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
@@ -191,29 +234,56 @@ def process_submission(model_name, csv_file):
    except Exception as e:
        return f"Error processing submission: {str(e)}", None

-with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
    gr.Markdown(
        """
        # 🇲🇱 Bambara ASR Leaderboard

-        This leaderboard ranks and evaluates speech recognition models for the Bambara language.
-        Models are ranked based on a combined score of WER and CER metrics.
        """
    )

    with gr.Tabs() as tabs:
-        with gr.TabItem("🏅 Current Rankings"):
-            try:
-                current_leaderboard = pd.read_csv(leaderboard_file)
-
-                if "Combined_Score" not in current_leaderboard.columns:
-                    current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
-
-                display_leaderboard = prepare_leaderboard_for_display(current_leaderboard)
-            except Exception:
-                display_leaderboard = pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
-
-            gr.Markdown("### Current ASR Model Rankings")

            ranking_method = gr.Radio(
                ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
@@ -222,7 +292,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
            )

            leaderboard_view = gr.DataFrame(
-                value=display_leaderboard,
                interactive=False,
                label="Models are ranked by selected metric - lower is better"
            )
@@ -233,34 +303,60 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
                outputs=[leaderboard_view]
            )

-            gr.Markdown(
-                """
-                ## Metrics Explanation
-                - **WER (%)**: Word Error Rate (lower is better) - measures word-level accuracy
-                - **CER (%)**: Character Error Rate (lower is better) - measures character-level accuracy
-                - **Combined Score (%)**: Weighted average of WER (70%) and CER (30%) - provides a balanced evaluation
-                """
-            )

        with gr.TabItem("📊 Submit New Results"):
            gr.Markdown(
                """
                ### Submit a new model for evaluation

-                Upload a CSV file with 'id' and 'text' columns to evaluate your ASR predictions.
-                The 'id's must match those in the reference dataset.
                """
            )

            with gr.Row():
-                model_name_input = gr.Textbox(label="Model Name", placeholder="e.g., MALIBA-AI/asr")
-                csv_upload = gr.File(label="Upload CSV File", file_types=[".csv"])

-            submit_btn = gr.Button("Submit")
            output_msg = gr.Textbox(label="Status", interactive=False)
            leaderboard_display = gr.DataFrame(
                label="Updated Leaderboard",
-                value=display_leaderboard,
                interactive=False
            )

@@ -269,6 +365,49 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
                inputs=[model_name_input, csv_upload],
                outputs=[output_msg, leaderboard_display]
            )

if __name__ == "__main__":
    demo.launch()
 
app.py (updated file)

from huggingface_hub import login

+# Login to Hugging Face Hub (if token is available)
token = os.environ.get("HG_TOKEN")
+if token:
+    login(token)

+# Load reference dataset
try:
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
    references = {row["id"]: row["text"] for row in dataset}
+    print(f"Loaded {len(references)} reference transcriptions")
except Exception as e:
+    print(f"Error loading dataset: {str(e)}")
    references = {}

+# Initialize or load the leaderboard file
leaderboard_file = "leaderboard.csv"
if not os.path.exists(leaderboard_file):
+    # Create a new leaderboard with sample data for testing
+    sample_data = [
+        ["MALIBA-AI/bambara-asr-v1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
+        ["whisper-large-v3-bambara", 0.3120, 0.1870, 0.2745, "2025-02-20 14:22:33"]
+    ]
+    pd.DataFrame(sample_data,
+                 columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False)
+    print(f"Created new leaderboard file with sample data")
else:
    leaderboard_df = pd.read_csv(leaderboard_file)

+    # Ensure the Combined_Score column exists
    if "Combined_Score" not in leaderboard_df.columns:
        leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
        leaderboard_df.to_csv(leaderboard_file, index=False)
+        print(f"Added Combined_Score column to existing leaderboard")
+    print(f"Loaded leaderboard with {len(leaderboard_df)} entries")

def normalize_text(text):
    """Normalize text for WER/CER calculation"""
 
            sample_wer = wer(reference, hypothesis)
            sample_cer = cer(reference, hypothesis)

+            # Cap extreme values to prevent outliers from skewing results
            sample_wer = min(sample_wer, 2.0)
            sample_cer = min(sample_cer, 2.0)
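The wer() and cer() helpers called here are not defined in this hunk; assuming they are the functions of the same name from the jiwer package, the per-sample capping step can be reproduced in isolation. The reference and hypothesis strings below are made up for illustration:

# Standalone sketch of the per-sample metric + capping step; assumes jiwer's wer/cer,
# and the reference/hypothesis strings are purely illustrative.
from jiwer import wer, cer

reference = "an bɛ taa sugu la"    # hypothetical normalized reference text
hypothesis = "an bɛ taa sugu"      # hypothetical normalized model output

sample_wer = min(wer(reference, hypothesis), 2.0)  # cap outliers at 2.0, as above
sample_cer = min(cer(reference, hypothesis), 2.0)
print(f"WER={sample_wer:.3f}  CER={sample_cer:.3f}")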
 
 
                "wer": sample_wer,
                "cer": sample_cer
            })
+        except Exception as e:
+            print(f"Error processing sample {id_val}: {str(e)}")
            pass

    if not results:
 
def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
    """Format leaderboard for display with ranking and percentages"""
+    if df is None or len(df) == 0:
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

+    # Make a copy to avoid modifying the original
    display_df = df.copy()

+    # Sort by the selected metric (lower is better)
    display_df = display_df.sort_values(sort_by)

+    # Add ranking column
    display_df.insert(0, "Rank", range(1, len(display_df) + 1))

+    # Format numeric columns as percentages
    for col in ["WER", "CER", "Combined_Score"]:
        if col in display_df.columns:
            display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")

+    # Keep both the raw values and percentage displays
+    # This allows for proper sorting while showing formatted values

    return display_df
 
 
        return prepare_leaderboard_for_display(current_lb, sort_column)

+    except Exception as e:
+        print(f"Error updating ranking: {str(e)}")
        return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

def process_submission(model_name, csv_file):
+    """Process a new model submission"""
+    if not model_name or not model_name.strip():
+        return "Error: Please provide a model name.", None
+
+    if not csv_file:
+        return "Error: Please upload a CSV file.", None
+
    try:
        df = pd.read_csv(csv_file)
 
        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)

+            # Check for suspiciously low values
            if avg_wer < 0.001:
                return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None

        except Exception as e:
            return f"Error calculating metrics: {str(e)}", None

+        # Load existing leaderboard
        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Calculate combined score (70% WER, 30% CER)
        combined_score = avg_wer * 0.7 + avg_cer * 0.3

+        # Check if model already exists
+        if model_name in leaderboard["Model_Name"].values:
+            # Update existing entry
+            idx = leaderboard[leaderboard["Model_Name"] == model_name].index
+            leaderboard.loc[idx, "WER"] = avg_wer
+            leaderboard.loc[idx, "CER"] = avg_cer
+            leaderboard.loc[idx, "Combined_Score"] = combined_score
+            leaderboard.loc[idx, "timestamp"] = timestamp
+            updated_leaderboard = leaderboard
+        else:
+            # Add new entry
+            new_entry = pd.DataFrame(
+                [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
+                columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
+            )
+            updated_leaderboard = pd.concat([leaderboard, new_entry])

+        # Sort and save updated leaderboard
+        updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
        updated_leaderboard.to_csv(leaderboard_file, index=False)

+        # Prepare for display
        display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)

        return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
 
    except Exception as e:
        return f"Error processing submission: {str(e)}", None

+def get_current_leaderboard():
+    """Get the current leaderboard data for display"""
+    try:
+        if os.path.exists(leaderboard_file):
+            current_leaderboard = pd.read_csv(leaderboard_file)
+
+            if "Combined_Score" not in current_leaderboard.columns:
+                current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
+                current_leaderboard.to_csv(leaderboard_file, index=False)
+
+            return current_leaderboard
+        else:
+            return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
+    except Exception as e:
+        print(f"Error getting leaderboard: {str(e)}")
+        return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
+
+def create_leaderboard_table():
+    """Create and format the leaderboard table for display"""
+    leaderboard_data = get_current_leaderboard()
+    return prepare_leaderboard_for_display(leaderboard_data)
+
+with gr.Blocks(title="Bambara ASR Leaderboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🇲🇱 Bambara ASR Leaderboard

+        This leaderboard tracks and evaluates speech recognition models for the Bambara language.
+        Models are ranked based on Word Error Rate (WER), Character Error Rate (CER), and a combined score.
+
+        ## Current Models Performance
        """
    )

+    current_data = get_current_leaderboard()
+
+    # Highlight top-performing model
+    if len(current_data) > 0:
+        best_model = current_data.sort_values("Combined_Score").iloc[0]
+        gr.Markdown(f"""
+        ### 🏆 Current Best Model: **{best_model['Model_Name']}**
+        * WER: **{best_model['WER']*100:.2f}%**
+        * CER: **{best_model['CER']*100:.2f}%**
+        * Combined Score: **{best_model['Combined_Score']*100:.2f}%**
+        """)
+
    with gr.Tabs() as tabs:
+        with gr.TabItem("🏅 Model Rankings"):
+            # Pre-load the leaderboard data
+            initial_leaderboard = create_leaderboard_table()

            ranking_method = gr.Radio(
                ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
 
            )

            leaderboard_view = gr.DataFrame(
+                value=initial_leaderboard,
                interactive=False,
                label="Models are ranked by selected metric - lower is better"
            )
 
                outputs=[leaderboard_view]
            )

+            with gr.Accordion("Metrics Explanation", open=False):
+                gr.Markdown(
+                    """
+                    ## Understanding ASR Metrics
+
+                    ### Word Error Rate (WER)
+                    WER measures how accurately the ASR system recognizes whole words:
+                    * Lower values indicate better performance
+                    * Calculated as: (Substitutions + Insertions + Deletions) / Total Words
+                    * A WER of 0% means perfect transcription
+                    * A WER of 20% means approximately 1 in 5 words contains an error
+
+                    ### Character Error Rate (CER)
+                    CER measures accuracy at the character level:
+                    * More fine-grained than WER
+                    * Better at capturing partial word matches
+                    * Particularly useful for agglutinative languages like Bambara
+
+                    ### Combined Score
+                    * Weighted average: 70% WER + 30% CER
+                    * Provides a balanced evaluation of model performance
+                    * Used as the primary ranking metric
+                    """
+                )

        with gr.TabItem("📊 Submit New Results"):
            gr.Markdown(
                """
                ### Submit a new model for evaluation

+                Upload a CSV file with the following format:
+                * Must contain exactly two columns: 'id' and 'text'
+                * The 'id' column should match the reference dataset IDs
+                * The 'text' column should contain your model's transcriptions
                """
            )

            with gr.Row():
+                model_name_input = gr.Textbox(
+                    label="Model Name",
+                    placeholder="e.g., MALIBA-AI/bambara-asr",
+                    info="Use a descriptive name to identify your model"
+                )
+                csv_upload = gr.File(
+                    label="Upload CSV File",
+                    file_types=[".csv"],
+                    info="CSV with columns: id, text"
+                )

+            submit_btn = gr.Button("Submit", variant="primary")
            output_msg = gr.Textbox(label="Status", interactive=False)
            leaderboard_display = gr.DataFrame(
                label="Updated Leaderboard",
+                value=initial_leaderboard,
                interactive=False
            )
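For submitters, the CSV format requested in the tab above is straightforward to produce with pandas; the ids and transcriptions below are placeholders and must be replaced with the real benchmark ids and your model's output:

# Sketch of a well-formed submission file: exactly two columns, "id" and "text".
import pandas as pd

rows = [
    {"id": "sample_0001", "text": "placeholder transcription"},  # id must exist in the reference set
    {"id": "sample_0002", "text": "another placeholder"},
]
pd.DataFrame(rows, columns=["id", "text"]).to_csv("predictions.csv", index=False)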
 
 
                inputs=[model_name_input, csv_upload],
                outputs=[output_msg, leaderboard_display]
            )
+
+        with gr.TabItem("📝 Benchmark Dataset"):
+            gr.Markdown(
+                """
+                ## About the Benchmark Dataset
+
+                This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/sudoping01/bambara-speech-recognition-benchmark)** dataset:
+
+                * Contains diverse Bambara speech samples
+                * Includes various speakers, accents, and dialects
+                * Covers different speech styles and recording conditions
+                * Professionally transcribed and validated
+
+                ### How to Generate Predictions
+
+                To submit results to this leaderboard:
+
+                1. Download the audio files from the benchmark dataset
+                2. Run your ASR model on the audio files
+                3. Generate a CSV file with 'id' and 'text' columns
+                4. Submit your results using the form in the "Submit New Results" tab
+
+                ### Evaluation Guidelines
+
+                * Text is normalized (lowercase, punctuation removed) before metrics calculation
+                * Extreme outliers are capped to prevent skewing results
+                * All submissions are validated for format and completeness
+                """
+            )
+
+    gr.Markdown(
+        """
+        ---
+        ### About MALIBA-AI
+
+        **MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation**
+
+        *"No Malian Language Left Behind"*
+
+        This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
+        For more information, visit [MALIBA-AI on Hugging Face](https://huggingface.co/MALIBA-AI).
+        """
+    )

if __name__ == "__main__":
    demo.launch()
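The "How to Generate Predictions" steps in the Benchmark Dataset tab map roughly onto the following end-to-end sketch; transcribe() is a stand-in for whatever ASR model is being evaluated, and the name of the audio field is an assumption about the benchmark schema, not something this commit confirms:

# Rough sketch of producing a submission for this leaderboard.
# transcribe() is hypothetical; replace it with your own model's inference call.
import pandas as pd
from datasets import load_dataset

def transcribe(audio) -> str:
    raise NotImplementedError("run your ASR model on the audio sample here")

eval_set = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
predictions = [{"id": row["id"], "text": transcribe(row["audio"])} for row in eval_set]  # "audio" column assumed
pd.DataFrame(predictions, columns=["id", "text"]).to_csv("predictions.csv", index=False)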