sudoping01 committed on
Commit
60c60cf
·
verified ·
1 Parent(s): 3efa4cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -20
app.py CHANGED
@@ -1,17 +1,10 @@
1
  import gradio as gr
2
  import pandas as pd
3
  from datasets import load_dataset
4
- from jiwer import wer, cer, transforms
5
  import os
6
  from datetime import datetime
7
 
8
- # Define text normalization transform
9
- transform = transforms.Compose([
10
- transforms.RemovePunctuation(),
11
- transforms.ToLowerCase(),
12
- transforms.RemoveWhiteSpace(replace_by_space=True),
13
- ])
14
-
15
  # Load the Bambara ASR dataset
16
  dataset = load_dataset("sudoping01/bambara-asr-benchmark", name="default")["train"]
17
  references = {row["id"]: row["text"] for row in dataset}
@@ -21,6 +14,25 @@ leaderboard_file = "leaderboard.csv"
21
  if not os.path.exists(leaderboard_file):
22
  pd.DataFrame(columns=["submitter", "WER", "CER", "timestamp"]).to_csv(leaderboard_file, index=False)
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def process_submission(submitter_name, csv_file):
25
  try:
26
  # Read and validate the uploaded CSV
@@ -39,22 +51,22 @@ def process_submission(submitter_name, csv_file):
39
  wers, cers = [], []
40
 
41
  for _, row in df.iterrows():
42
- ref = str(references[row["id"]]) # Ensure reference is a string
43
- pred = str(row["text"]) # Ensure prediction is a string
44
 
45
- # Apply transformation directly to the text strings before WER/CER calculation
46
- ref_transformed = " ".join(transform(ref).split())
47
- pred_transformed = " ".join(transform(pred).split())
48
-
49
- # Check if transformation produced valid result
50
- if not ref_transformed or not pred_transformed:
51
- return f"Error: Empty string after transformation for id {row['id']}", None
52
 
53
- # Calculate metrics without transform parameter (we pre-transformed)
54
- wers.append(wer(ref_transformed, pred_transformed))
55
- cers.append(cer(ref_transformed, pred_transformed))
 
56
 
57
  # Compute average WER and CER
 
 
 
58
  avg_wer = sum(wers) / len(wers)
59
  avg_cer = sum(cers) / len(cers)
60
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from datasets import load_dataset
4
+ from jiwer import wer, cer
5
  import os
6
  from datetime import datetime
7
 
 
 
 
 
 
 
 
8
  # Load the Bambara ASR dataset
9
  dataset = load_dataset("sudoping01/bambara-asr-benchmark", name="default")["train"]
10
  references = {row["id"]: row["text"] for row in dataset}
 
14
  if not os.path.exists(leaderboard_file):
15
  pd.DataFrame(columns=["submitter", "WER", "CER", "timestamp"]).to_csv(leaderboard_file, index=False)
16
 
17
+ def preprocess_text(text):
18
+ """
19
+ Custom text preprocessing to handle Bambara text properly
20
+ """
21
+ # Convert to string in case it's not
22
+ text = str(text)
23
+
24
+ # Remove punctuation
25
+ for punct in [',', '.', '!', '?', ';', ':', '"', "'"]:
26
+ text = text.replace(punct, '')
27
+
28
+ # Convert to lowercase
29
+ text = text.lower()
30
+
31
+ # Normalize whitespace
32
+ text = ' '.join(text.split())
33
+
34
+ return text
35
+
36
  def process_submission(submitter_name, csv_file):
37
  try:
38
  # Read and validate the uploaded CSV
 
51
  wers, cers = [], []
52
 
53
  for _, row in df.iterrows():
54
+ ref = preprocess_text(references[row["id"]])
55
+ pred = preprocess_text(row["text"])
56
 
57
+ # Check if either text is empty after preprocessing
58
+ if not ref or not pred:
59
+ continue
 
 
 
 
60
 
61
+ # Calculate metrics with no transform (we did preprocessing already)
62
+ # This avoids the error with jiwer's transform
63
+ wers.append(wer(ref, pred))
64
+ cers.append(cer(ref, pred))
65
 
66
  # Compute average WER and CER
67
+ if not wers or not cers:
68
+ return "Error: No valid text pairs for evaluation after preprocessing.", None
69
+
70
  avg_wer = sum(wers) / len(wers)
71
  avg_cer = sum(cers) / len(cers)
72