File size: 3,845 Bytes
5a8d3bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0f8536
 
5a8d3bb
79d84e7
3afcae0
 
79d84e7
edc8e2a
5a8d3bb
 
 
 
 
 
 
 
 
 
 
 
8cef93c
5a8d3bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3afcae0
 
5a8d3bb
 
 
 
8c15375
 
5a8d3bb
 
 
 
 
f0f8536
5a8d3bb
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import tempfile
import torch
import numpy as np
import gradio as gr
import scipy.io.wavfile as wavfile
from pydub import AudioSegment
from transformers import VitsModel, AutoTokenizer

# ---------- Configuration --------------------------------------------------
# Define available TTS models here. Add new entries as needed.
# Each entry maps a display name (used in the UI dropdown) to the Hub repo
# ids for its tokenizer and model checkpoint — note these may differ, as
# they do for Swahili below.
TTS_MODELS = {
    "Swahili": {
        "tokenizer": "FarmerlineML/swahili-tts-2025",
        "checkpoint": "FarmerlineML/Swahili-tts-2025_part4"
    },
    "Krio": {
        "tokenizer": "FarmerlineML/Krio-TTS",
        "checkpoint": "FarmerlineML/Krio-TTS"
    },
    "Ewe": {
        "tokenizer": "FarmerlineML/Ewe-tts-2025_v2",
        "checkpoint": "FarmerlineML/Ewe-tts-2025_v2"
    },
    
}

# Run inference on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- Load all models & tokenizers -----------------------------------
# All models are loaded eagerly at import time (may download from the Hub),
# so the first request pays no load latency. Both dicts are keyed by the
# display names defined in TTS_MODELS above.
models = {}
tokenizers = {}
for name, paths in TTS_MODELS.items():
    print(f"Loading {name} model...")
    model = VitsModel.from_pretrained(paths["checkpoint"]).to(device)
    model.eval()  # inference mode (disables dropout etc.)
    # Apply clear-speech inference parameters (tweak per model if desired)
    # NOTE(review): VITS sampling knobs — speaking_rate < 1.0 slows speech;
    # presumably tuned for intelligibility; confirm values per language.
    model.noise_scale = 0.8
    model.noise_scale_duration = 0.667
    model.speaking_rate = 0.75
    models[name] = model
    tokenizers[name] = AutoTokenizer.from_pretrained(paths["tokenizer"])

# ---------- Utility: WAV ➔ MP3 Conversion -----------------------------------
def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str:
    """Convert a numpy waveform to an MP3 temp file and return its path.

    Args:
        wave_np: 1-D waveform. Float arrays are assumed to lie in [-1, 1]
            and are scaled to int16; int16 arrays are written as-is.
        sr: Sampling rate in Hz.

    Returns:
        Path to a 64 kbps MP3 in the system temp directory. The caller is
        responsible for deleting it.
    """
    # pydub needs PCM int16. Clip first so out-of-range float samples
    # saturate instead of wrapping around on the int16 cast.
    if wave_np.dtype != np.int16:
        wave_np = (np.clip(wave_np, -1.0, 1.0) * 32767).astype(np.int16)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
        wavfile.write(tf.name, sr, wave_np)
        wav_path = tf.name

    # splitext (not str.replace) so only the final extension changes even
    # if ".wav" happens to appear elsewhere in the temp path.
    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"
    try:
        AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3", bitrate="64k")
    finally:
        os.remove(wav_path)  # always clean up the intermediate WAV
    return mp3_path

# ---------- TTS Generation ---------------------------------------------------
def tts_generate(model_name: str, text: str):
    """Generate speech for `text` using the selected model.

    Args:
        model_name: Key into the module-level `models`/`tokenizers` dicts
            (one of the TTS_MODELS display names).
        text: Text to synthesize.

    Returns:
        Path to an MP3 file with the synthesized speech, or None when
        `text` is empty or whitespace-only (Gradio renders None as no audio).
    """
    # Guard before touching the model: whitespace-only input would
    # otherwise slip past a plain truthiness check and reach the tokenizer.
    if not text or not text.strip():
        return None
    model = models[model_name]
    tokenizer = tokenizers[model_name]
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        wave = model(**inputs).waveform[0].cpu().numpy()
    return _wav_to_mp3(wave, model.config.sampling_rate)

# ---------- Gradio Interface ------------------------------------------------
# Example (model, text) rows shown beneath the Gradio interface; clicking a
# row pre-fills the dropdown and textbox. Texts are in the target languages.
examples = [
    ["Ewe", "Wotsɔ ketrifɔ mlɔ xɔ ŋu. ɖeviwo nɔ ketrifɔ ŋu. ɖeviawo ƒe gbɔsɔsɔ me anɔ abe enyi. fi si ɖeviwo le la ƒo ɖi. ɖeviawo kɔ nu kake aɖewo ɖe asi ɖewo hā nɔ wonuiwo kplɔm."],
    ["Ewe", "amewo le yɔƒe me eye aɖake le wogbɔ. wodo awu yibɔ ŋutsu aɖe le kponyi fam le akɔ fam ne nyɔnu aɖe."],
    ["Swahili", "zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea"],
    ["Swahili", "Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee."],
    ["Swahili", "Tafadhali hakikisha umefunga mlango kabla ya kuondoka."],
    ["Krio", "Wetin na yu nem?"],
    ["Krio", "aw yu de du"],
    ["Krio", "A de go skul"],
]

# ---------- Gradio Interface ------------------------------------------------
# Build the two input widgets up front so the Interface call stays readable.
_model_selector = gr.Dropdown(
    choices=list(TTS_MODELS.keys()),
    value="Swahili",
    label="Choose TTS Model",
)
_text_box = gr.Textbox(lines=3, placeholder="Enter text here", label="Input Text")

demo = gr.Interface(
    fn=tts_generate,
    inputs=[_model_selector, _text_box],
    outputs=gr.Audio(type="filepath", label="Audio", autoplay=True),
    title="Multi‐Model Text-to-Speech",
    description="Select a TTS model from the dropdown and enter text to generate speech.",
    examples=examples,
    cache_examples=True,  # pre-renders example audio at startup
)

if __name__ == "__main__":
    demo.launch()