File size: 5,494 Bytes
2316ebf
4868e2d
a38745d
4868e2d
 
 
 
 
79ddbe0
 
 
 
 
4868e2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a9e734
4868e2d
 
 
 
8a9e734
4868e2d
 
 
 
79ddbe0
 
4868e2d
 
 
4a1c4a6
4868e2d
 
0631ef3
 
 
 
 
 
 
 
4868e2d
b88a508
 
d06414f
 
b88a508
4868e2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79ddbe0
4868e2d
 
 
 
 
 
 
2316ebf
 
4868e2d
ea22968
4868e2d
2316ebf
4868e2d
 
79ddbe0
 
4868e2d
2316ebf
 
4868e2d
 
 
 
 
 
0631ef3
 
 
 
 
 
 
 
4868e2d
2316ebf
4868e2d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import gradio as gr

LAST_UPDATED = "06/12/2024"  # shown verbatim in the UI footer; DD/MM vs MM/DD is ambiguous — TODO confirm

####################################
# Static leaderboard data
####################################
# One dict per evaluated TTS model. A value of 0 marks a metric that has
# not been computed for that model yet; it is rendered as-is in the table.
leaderboard_data = [
    {'name': 'StyleTTS 2', 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 2.42, 'SpeechBERT': 0, 'Logf0': 0},
    {'name': 'Matxa-TTS', 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50, 'SpeechBERT': 0, 'Logf0': 0},
    {'name': 'Matxa-TTS-multiaccent', 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98, 'SpeechBERT': 0, 'Logf0': 0},
    {'name': 'StableTTS', 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62, 'SpeechBERT': 0.7837, 'Logf0': 0.3831},
    {'name': 'Vits 2', 'PESQ': 0, 'WER': 0, 'UTMOS': 3.61, 'SpeechBERT': 0, 'Logf0': 0},
]


# Markdown body for the "Metrics" tab (rendered by gr.Markdown below).
METRICS_TAB_TEXT = """
## Metrics
Models in the leaderboard are evaluated using several key metrics: 
* **UTMOS** (UTokyo-SaruLab Mean Opinion Score),
* **WER** (Word Error Rate),
* **PESQ** (Perceptual Evaluation of Speech Quality).
These metrics help evaluate both the accuracy and quality of the model.
### UTMOS (UTokyo-SaruLab Mean Opinion Score)[[Paper](https://arxiv.org/abs/2204.02152)]
UTMOS is a MOS prediction system. **A higher UTMOS indicates better quality** of the generated voice.
### WER (Word Error Rate)
WER is a common metric for evaluating speech recognition systems. It measures the percentage of words in the generated transcript that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
Example:

| Reference   | the  | cat | sat     | on  | the  | mat |
|-------------|------|-----|---------|-----|------|-----|
| Prediction  | the  | cat | **sit** | on  | the  |     |
| Label       | ✅   | ✅  | S       | ✅  | ✅   | D   |

The WER calculation is done as follows:
```
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
```
Moreover, We calculate the WER using the STT_Ca_Citrinet_512 model. [[Link](https://langtech-bsc.gitbook.io/aina-kit/aina-hack/automatic-speech-recognition)]

### PESQ (Perceptual Evaluation of Speech Quality)[[Paper](https://ieeexplore.ieee.org/abstract/document/941023?casa_token=jdtHy84_KhQAAAAA:qHN3WbT6cNdufj6OOn_fn0Je0RedMv-WJCmhQ_3CWy4nMTuDvFMF3KstAmKqLx5suQwdPgGByoY)]
PESQ is a perceptual metric that evaluates the quality of speech in a similar manner to how a human listener would. **A higher PESQ indicates better voice quality**.
## Benchmark Datasets
Model performance is evaluated using [our test datasets](https://huggingface.co/spaces/projecte-aina/catalan_tts_arena/blob/main/catalan_benchmark_v1.txt). These datasets cover a variety of domains and acoustic conditions, ensuring a robust evaluation.
"""

# BibTeX snippet offered in the "Citation" accordion (copy button).
# Fixes vs. previous version:
#  - "Texto-to-Speech" typo in the title -> "Text-to-Speech".
#  - Authors are separated with " and " as BibTeX requires; commas
#    inside the author field are parsed as "Last, First" name parts,
#    so the old comma-separated list was one mangled author.
CITATION_TEXT = """@misc{catalan-tts-arena,
	title        = {Catalan Text-to-Speech Leaderboard},
	author       = {Rodolfo Zevallos and José Giraldo and Alex Peiró-Lilja and Carme Armentano-Oller},
	year         = 2024,
	publisher    = {Hugging Face},
	howpublished = "\\url{https://huggingface.co/spaces/projecte-aina/catalan_tts_arena}"
}
"""

# Markdown header rendered at the top of the app.
DESCR = """
# 🏆 Catalan TTS Arena: Benchmarking TTS Models
\nThe Catalan TTS Leaderboard ranks and evaluates TTS models in Catalan.
\nThe leaderboard currently focuses on Catalan TTS, and will be expanded to multilingual evaluation in later versions.
""".strip()

####################################
# Functions (static version)
####################################

def get_leaderboard(data=None):
    """Return the leaderboard rows sorted by UTMOS, descending.

    NOTE(review): the previous docstring claimed a PESQ-then-UTMOS sort,
    but the code has always sorted by UTMOS only and the UI marks UTMOS
    as the ranking column ("UTMOS ⬆️"), so UTMOS-only is the intended
    order; the docstring was wrong, not the code.

    Args:
        data: Optional list of metric dicts with the same schema as
            ``leaderboard_data``. Defaults to the module-level
            ``leaderboard_data`` (backward compatible: existing callers
            pass no argument).

    Returns:
        List of ``[rank, name, UTMOS, WER, PESQ, SpeechBERT, Logf0]``
        rows, where rank is the 1-based position after sorting.
    """
    rows = leaderboard_data if data is None else data
    # Best perceived quality (highest UTMOS) first.
    ranked = sorted(rows, key=lambda m: m['UTMOS'], reverse=True)
    # enumerate(start=1) yields the 1-indexed rank directly; unlike the
    # previous version, this no longer mutates the shared input dicts
    # (it used to inject a 'rank' key into every entry).
    return [
        [rank, m['name'], m['UTMOS'], m['WER'], m['PESQ'], m['SpeechBERT'], m['Logf0']]
        for rank, m in enumerate(ranked, start=1)
    ]

####################################
# Gradio interface
####################################

# Base theme with the project's preferred fonts.
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown(DESCR, elem_classes="markdown-text")
    
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            # Static table: populated once at startup from the module data.
            # NOTE(review): datatype is all "str" although most columns hold
            # numbers — presumably intentional for display; confirm if column
            # sorting in the UI ever matters.
            leaderboard_table = gr.DataFrame(
                headers=["Rank", "Model", "UTMOS ⬆️️", "WER ⬇️", "PESQ", "SpeechBERT ⬆️", "Logf0 ⬆️"], 
                datatype=["str", "str", "str", "str", "str", "str", "str"], 
                value=get_leaderboard()  # load the initial table rows
            )

        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        
    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            # Read-only textbox with a copy button holding the BibTeX entry.
            gr.Textbox(
                value=CITATION_TEXT, lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )


# Launch the app. api_open=False / show_api=False hide the programmatic
# API; default_concurrency_limit=40 caps concurrent event handlers.
demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False)