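Update app.py: remove the token debug print and section comments; comment out punctuation removal and whitespace normalization in normalize_text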
Browse files
app.py
CHANGED
@@ -8,19 +8,14 @@ import re
|
|
8 |
|
9 |
from huggingface_hub import login
|
10 |
|
11 |
-
|
12 |
token = os.environ.get("HG_TOKEN")
|
13 |
|
14 |
-
print(f"Token exists: {token is not None}")
|
15 |
-
|
16 |
login(token)
|
17 |
|
18 |
-
# Load the Bambara ASR dataset
|
19 |
print("Loading dataset...")
|
20 |
dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
|
21 |
references = {row["id"]: row["text"] for row in dataset}
|
22 |
|
23 |
-
# Load or initialize the leaderboard
|
24 |
leaderboard_file = "leaderboard.csv"
|
25 |
if not os.path.exists(leaderboard_file):
|
26 |
pd.DataFrame(columns=["submitter", "WER", "CER", "timestamp"]).to_csv(leaderboard_file, index=False)
|
@@ -41,11 +36,11 @@ def normalize_text(text):
|
|
41 |
# Convert to lowercase
|
42 |
text = text.lower()
|
43 |
|
44 |
-
# Remove punctuation, keeping spaces
|
45 |
-
text = re.sub(r'[^\w\s]', '', text)
|
46 |
|
47 |
-
# Normalize whitespace
|
48 |
-
text = re.sub(r'\s+', ' ', text).strip()
|
49 |
|
50 |
return text
|
51 |
|
|
|
8 |
|
9 |
from huggingface_hub import login
|
10 |
|
|
|
11 |
token = os.environ.get("HG_TOKEN")
|
12 |
|
|
|
|
|
13 |
login(token)
|
14 |
|
|
|
15 |
print("Loading dataset...")
|
16 |
dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
|
17 |
references = {row["id"]: row["text"] for row in dataset}
|
18 |
|
|
|
19 |
leaderboard_file = "leaderboard.csv"
|
20 |
if not os.path.exists(leaderboard_file):
|
21 |
pd.DataFrame(columns=["submitter", "WER", "CER", "timestamp"]).to_csv(leaderboard_file, index=False)
|
|
|
36 |
# Convert to lowercase
|
37 |
text = text.lower()
|
38 |
|
39 |
+
# # Remove punctuation, keeping spaces
|
40 |
+
# text = re.sub(r'[^\w\s]', '', text)
|
41 |
|
42 |
+
# # Normalize whitespace
|
43 |
+
# text = re.sub(r'\s+', ' ', text).strip()
|
44 |
|
45 |
return text
|
46 |
|