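"""Bambara ASR Leaderboard.

A Gradio app that accepts CSV submissions of ASR transcriptions ('id', 'text'),
scores them against the bambara-speech-recognition-benchmark references with
WER and CER (simple and length-weighted averages), and maintains a leaderboard
that keeps each submitter's best entry.
"""
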
import gradio as gr
import pandas as pd
from datasets import load_dataset
from jiwer import wer, cer
import os
from datetime import datetime
import re

from huggingface_hub import login

# Authentication setup
token = os.environ.get("HG_TOKEN")
print(f"Token exists: {token is not None}")
if token:
    print(f"Token length: {len(token)}")
    print(f"Token first few chars: {token[:4]}...")
    login(token)  # only log in when a token is actually available
else:
    print("WARNING: HG_TOKEN not set; gated/private dataset access will fail")

print("Loading dataset...")
try:
    # `token` replaces the deprecated `use_auth_token` argument in recent `datasets` releases
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default", token=token)["eval"]
    print(f"Successfully loaded dataset with {len(dataset)} samples")
    references = {row["id"]: row["text"] for row in dataset}
except Exception as e:
    print(f"Error loading dataset: {str(e)}")
    # Fallback in case dataset can't be loaded
    references = {}
    print("WARNING: Using empty references dictionary due to dataset loading error")

# Initialize leaderboard file
leaderboard_file = "leaderboard.csv"
if not os.path.exists(leaderboard_file):
    pd.DataFrame(columns=["submitter", "WER", "CER", "weighted_WER", "weighted_CER", "samples_evaluated", "timestamp"]).to_csv(leaderboard_file, index=False)
else:
    print(f"Loaded existing leaderboard with {len(pd.read_csv(leaderboard_file))} entries")

def normalize_text(text):
    """
    Normalize text by converting to lowercase, removing punctuation, and normalizing whitespace.
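    Example: normalize_text("  I ni CE!  ") returns "i ni ce".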
    """
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    
    # Remove punctuation, keeping spaces
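    # (\w is Unicode-aware in Python 3, so non-ASCII Bambara letters are preserved)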
    text = re.sub(r'[^\w\s]', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def calculate_metrics(predictions_df):
    """
    Calculate WER and CER for each sample and return averages and per-sample results.
    Uses both standard average and length-weighted average.
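    The weighted averages scale each sample's error by its reference length
    (word count for WER, character count for CER), so longer references count more.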
    """
    per_sample_metrics = []
    total_ref_words = 0
    total_ref_chars = 0

    # Process each sample
    for _, row in predictions_df.iterrows():
        id_val = row["id"]
        if id_val not in references:
            print(f"Warning: ID {id_val} not found in references")
            continue
            
        reference = normalize_text(references[id_val])
        hypothesis = normalize_text(row["text"])
        
        if not reference or not hypothesis:
            print(f"Warning: Empty reference or hypothesis for ID {id_val}")
            continue
            
        reference_words = reference.split()
        reference_chars = list(reference)
        
        # Skip very short references for more stable metrics
        if len(reference_words) < 2:
            print(f"Warning: Reference too short for ID {id_val}, skipping")
            continue
        
        # Print debug info for the first few samples
        if len(per_sample_metrics) < 5:
            print(f"ID: {id_val}")
            print(f"Reference: '{reference}'")
            print(f"Hypothesis: '{hypothesis}'")
            print(f"Reference words: {reference_words}")
        
        try:
            # Calculate WER and CER
            sample_wer = wer(reference, hypothesis)
            sample_cer = cer(reference, hypothesis)
            
            # Cap metrics at sensible values to prevent outliers
            sample_wer = min(sample_wer, 2.0)  # Cap at 200% WER
            sample_cer = min(sample_cer, 2.0)  # Cap at 200% CER
            
            # For weighted calculations
            total_ref_words += len(reference_words)
            total_ref_chars += len(reference_chars)
            
            if len(per_sample_metrics) < 5:
                print(f"WER: {sample_wer}, CER: {sample_cer}")
            
            per_sample_metrics.append({
                "id": id_val,
                "reference": reference,
                "hypothesis": hypothesis,
                "ref_word_count": len(reference_words),
                "ref_char_count": len(reference_chars),
                "wer": sample_wer,
                "cer": sample_cer
            })
        except Exception as e:
            print(f"Error calculating metrics for ID {id_val}: {str(e)}")
    
    if not per_sample_metrics:
        raise ValueError("No valid samples for WER/CER calculation")
    
    # Calculate standard average metrics
    avg_wer = sum(item["wer"] for item in per_sample_metrics) / len(per_sample_metrics)
    avg_cer = sum(item["cer"] for item in per_sample_metrics) / len(per_sample_metrics)
    
    # Calculate weighted average metrics based on reference length
    weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in per_sample_metrics) / total_ref_words
    weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in per_sample_metrics) / total_ref_chars
    
    print(f"Simple average WER: {avg_wer:.4f}, CER: {avg_cer:.4f}")
    print(f"Weighted average WER: {weighted_wer:.4f}, CER: {weighted_cer:.4f}")
    print(f"Processed {len(per_sample_metrics)} valid samples")
    
    return avg_wer, avg_cer, weighted_wer, weighted_cer, per_sample_metrics

def styled_error(message):
    """Format error messages with red styling"""
    return f"<div style='color: red; font-weight: bold; padding: 10px; border-radius: 5px; background-color: #ffe0e0;'>{message}</div>"

def styled_success(message):
    """Format success messages with green styling"""
    return f"<div style='color: green; font-weight: bold; padding: 10px; border-radius: 5px; background-color: #e0ffe0;'>{message}</div>"

def styled_info(message):
    """Format informational messages with blue styling"""
    return f"<div style='color: #004080; padding: 10px; border-radius: 5px; background-color: #e0f0ff;'>{message}</div>"

def process_submission(submitter_name, csv_file):
    """
    Process a submission CSV, calculate metrics, and update the leaderboard.
    Returns a status message and updated leaderboard.
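    Expects a CSV with exactly 'id' and 'text' columns whose ids match the reference dataset.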
    """
    try:
        # Validate submitter name
        if not submitter_name or len(submitter_name.strip()) < 3:
            return styled_error("Please provide a valid submitter name (at least 3 characters)"), None
        
        # Read and validate the uploaded CSV
        df = pd.read_csv(csv_file)
        print(f"Processing submission from {submitter_name} with {len(df)} rows")
        
        # Basic validation
        if len(df) == 0:
            return styled_error("Error: Uploaded CSV is empty."), None
        
        if len(df) < 10:
            return styled_error("Error: Submission contains too few samples (minimum 10 required)."), None
            
        if set(df.columns) != {"id", "text"}:
            return styled_error(f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}"), None
            
        if df["id"].duplicated().any():
            dup_ids = df[df["id"].duplicated()]["id"].unique()
            return styled_error(f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}."), None

        # Ensure text column contains strings
        df["text"] = df["text"].astype(str)

        # Check for valid references
        if not references:
            return styled_error("Error: Reference dataset could not be loaded. Please try again later."), None

        # Check if IDs match the reference dataset
        missing_ids = set(references.keys()) - set(df["id"])
        extra_ids = set(df["id"]) - set(references.keys())
        
        if missing_ids:
            return styled_error(f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}."), None
            
        if extra_ids:
            return styled_error(f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}."), None
        
        # Check for suspicious submissions (high percentage of exact matches)
        exact_matches = 0
        for _, row in df.iterrows():
            if normalize_text(row["text"]) == normalize_text(references[row["id"]]):
                exact_matches += 1
        
        exact_match_ratio = exact_matches / len(df)
        if exact_match_ratio > 0.95:  # more than 95% exact matches suggests the reference texts were copied
            return styled_error("Suspicious submission: Too many exact matches with reference texts."), None
        
        # Calculate metrics
        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)
            
            # Debug information
            print(f"Calculated metrics - WER: {avg_wer:.4f}, CER: {avg_cer:.4f}")
            print(f"Weighted metrics - WER: {weighted_wer:.4f}, CER: {weighted_cer:.4f}")
            print(f"Processed {len(detailed_results)} valid samples")
            
            # Check for suspiciously low values
            if avg_wer < 0.001 or weighted_wer < 0.001:
                print("WARNING: WER is extremely low - likely an error")
                return styled_error("Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV."), None
                
        except Exception as e:
            print(f"Error in metrics calculation: {str(e)}")
            return styled_error(f"Error calculating metrics: {str(e)}"), None
        
        # Update the leaderboard
        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        new_entry = pd.DataFrame(
            [[submitter_name, avg_wer, avg_cer, weighted_wer, weighted_cer, len(detailed_results), timestamp]],
            columns=["submitter", "WER", "CER", "weighted_WER", "weighted_CER", "samples_evaluated", "timestamp"]
        )
        
        # Combine with existing leaderboard and keep only the best submission per submitter
        combined = pd.concat([leaderboard, new_entry])
        # Sort by WER (ascending) and get first entry for each submitter
        best_entries = combined.sort_values("WER").groupby("submitter").first().reset_index()
        # Sort the resulting dataframe by WER
        updated_leaderboard = best_entries.sort_values("WER")
        updated_leaderboard.to_csv(leaderboard_file, index=False)
        
        # Create detailed metrics summary
        metrics_summary = f"""
        <h3>Submission Results</h3>
        <table>
            <tr><td><b>Submitter:</b></td><td>{submitter_name}</td></tr>
            <tr><td><b>Word Error Rate (WER):</b></td><td>{avg_wer:.4f}</td></tr>
            <tr><td><b>Character Error Rate (CER):</b></td><td>{avg_cer:.4f}</td></tr>
            <tr><td><b>Weighted WER:</b></td><td>{weighted_wer:.4f}</td></tr>
            <tr><td><b>Weighted CER:</b></td><td>{weighted_cer:.4f}</td></tr>
            <tr><td><b>Samples Evaluated:</b></td><td>{len(detailed_results)}</td></tr>
            <tr><td><b>Submission Time:</b></td><td>{timestamp}</td></tr>
        </table>
        """
        
        return styled_success("Submission processed successfully!") + styled_info(metrics_summary), updated_leaderboard
        
    except Exception as e:
        print(f"Error processing submission: {str(e)}")
        return styled_error(f"Error processing submission: {str(e)}"), None

# Create the Gradio interface
with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
    gr.Markdown(
        """
        # Bambara ASR Leaderboard
        
        Upload a CSV file with `id` and `text` columns to evaluate your ASR predictions.
        The `id` values must match those in the benchmark dataset.
        
        ## Metrics
        - **WER**: Word Error Rate (lower is better) - measures word-level accuracy
        - **CER**: Character Error Rate (lower is better) - measures character-level accuracy
        
        We report both standard averages and length-weighted averages (where longer samples have more influence on the final score).
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            submitter = gr.Textbox(
                label="Submitter Name or Model Name", 
                placeholder="e.g., MALIBA-AI/asr",
                info="Name to appear on the leaderboard"
            )
            csv_upload = gr.File(
                label="Upload CSV File ('id' and 'text' columns)",
                file_types=[".csv"]
            )
            submit_btn = gr.Button("Submit", variant="primary")
            
        with gr.Column(scale=2):
            with gr.Accordion("Submission Format", open=False):
                gr.Markdown(
                    """
                    ### CSV Format Requirements
                    
                    Your CSV file must:
                    - Have exactly two columns: `id` and `text`
                    - The `id` column must match the IDs in the reference dataset
                    - The `text` column should contain your model's transcriptions
                    
                    Example:
                    ```
                    id,text
                    audio_001,n ye foro ka taa
                    audio_002,i ni ce
                    ```
                    
                    ### Evaluation Process
                    
                    Your submissions are evaluated by:
                    1. Normalizing both reference and predicted text (lowercase, punctuation removal)
                    2. Calculating Word Error Rate (WER) and Character Error Rate (CER)
                    3. Computing both simple average and length-weighted average
                    4. Ranking on the leaderboard by WER (lower is better)
                    
                    Only your best submission is kept on the leaderboard.
                    """
                )
    
    output_msg = gr.HTML(label="Status")
    
    # Leaderboard display
    with gr.Accordion("Leaderboard", open=True):
        leaderboard_display = gr.DataFrame(
            label="Current Standings",
            value=pd.read_csv(leaderboard_file),
            interactive=False
        )
    
    submit_btn.click(
        fn=process_submission,
        inputs=[submitter, csv_upload],
        outputs=[output_msg, leaderboard_display]
    )

# Print startup message
print("Starting Bambara ASR Leaderboard app...")

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
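
# A minimal sketch of how a submitter might build a valid CSV (illustrative only;
# the ids below are the placeholder examples from the docs above, not real dataset ids):
#
#   import pandas as pd
#   preds = {"audio_001": "n ye foro ka taa", "audio_002": "i ni ce"}
#   pd.DataFrame({"id": list(preds), "text": list(preds.values())}).to_csv(
#       "submission.csv", index=False
#   )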