# Spaces: Running
import gradio as gr | |
import torch | |
import soundfile as sf | |
import spaces | |
import os | |
import numpy as np | |
import re | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
from speechbrain.pretrained import EncoderClassifier | |
from datasets import load_dataset | |
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def load_models_and_data():
    """Load the synthesis models, the speaker encoder and one dataset example.

    Returns:
        A 5-tuple ``(model, processor, vocoder, speaker_model, example)``:
        the fine-tuned SpeechT5 TTS model and the HiFi-GAN vocoder (both moved
        to ``device``), the SpeechT5 processor, the SpeechBrain speaker
        encoder, and the row at index 304 of the dataset's ``train`` split.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    tts_model = SpeechT5ForTextToSpeech.from_pretrained("speecht5_finetuned_Aumkesh_tr")
    tts_model = tts_model.to(device)

    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    hifigan = hifigan.to(device)

    # The encoder's files are stored under /tmp, in a directory named after
    # the model source.
    encoder_source = "speechbrain/spkrec-xvect-voxceleb"
    encoder = EncoderClassifier.from_hparams(
        source=encoder_source,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", encoder_source),
    )

    # A single fixed row serves as the reference for the default embedding.
    train_split = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
    reference_example = train_split[304]

    return tts_model, processor, hifigan, encoder, reference_example
# Load everything once at import time so each request reuses the same objects.
(
    model,
    processor,
    vocoder,
    speaker_model,
    default_example,
) = load_models_and_data()
def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (presumably a
            1-D array of mono samples -- confirm against the dataset).

    Returns:
        The encoder output, L2-normalized along dim 2 and with all size-1
        dimensions squeezed away, as a tensor on ``device``.
    """
    # Add a leading batch dimension before handing the audio to the encoder.
    batch = torch.tensor(waveform).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = speaker_model.encode_batch(batch)
        embedding = torch.nn.functional.normalize(embedding, dim=2)
    return embedding.squeeze()
def prepare_default_embedding(example):
    """Build the speaker embedding for a dataset example.

    Args:
        example: Mapping with an ``"audio"`` entry whose ``"array"`` field
            holds the waveform samples.

    Returns:
        Whatever ``create_speaker_embedding`` returns for that waveform.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)
# Speaker embedding computed once from the reference example at import time.
default_embedding = prepare_default_embedding(example=default_example)
# (term, respelling) pairs; each term is a technical word or acronym and the
# respelling is a hyphenated form of it. Order is preserved.
replacements = list({
    'API': 'A-P-I',
    'CUDA': 'Coo-da',
    'ChatGPT': 'Chat-G-P-T',
    'HTTP': 'H-T-T-P',
    'JSON': 'J-S-O-N',
    'GPU': 'G-P-U',
    'RAM': 'R-A-M',
    'CPU': 'C-P-U',
    'SQL': 'S-Q-L',
    'NLP': 'N-L-P',
    'PyTorch': 'Pie-torch',
    'TensorFlow': 'Ten-sor-flow',
    'SaaS': 'SaaS',
    'GitHub': 'Git-Hub',
    'Docker': 'Dock-er',
    'Kubernetes': 'Koo-ber-net-ees',
    'OpenAI': 'Open-A-I',
    'IOT': 'I-O-T',
    'Linux': 'Li-nux',
}.items())
# English words for the numbers that cannot be composed from smaller ones.
number_words = {
    0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five",
    6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten",
    11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
    15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen",
    19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
    60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety",
    100: "hundred", 1000: "thousand",
}

# (scale, name, exclusive upper bound of the range that scale covers).
_NUMBER_SCALES = (
    (100, "hundred", 1_000),
    (1_000, "thousand", 1_000_000),
    (1_000_000, "million", 1_000_000_000),
    (1_000_000_000, "billion", 1_000_000_000_000),
)


def number_to_words(number):
    """Spell out an integer in English words.

    Args:
        number: The integer to convert.

    Returns:
        The space-separated English words for ``number`` (for example
        ``"one hundred twenty three"``). Negative values are prefixed with
        ``"minus"``. Values of one trillion or more are returned as their
        decimal digits via ``str(number)``.
    """
    if number < 0:
        return "minus " + number_to_words(-number)
    if number < 20:
        return number_words[number]
    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return f"{words} {number_words[unit]}" if unit else words

    for scale, name, limit in _NUMBER_SCALES:
        if number < limit:
            count, remainder = divmod(number, scale)
            # Always say the multiplier, including "one hundred" and
            # "one thousand"; omitting it left a leading space.
            words = f"{number_to_words(count)} {name}"
            if remainder:
                words += " " + number_to_words(remainder)
            return words

    # Too large for the scale names above: fall back to the digits.
    return str(number)
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with English words.

    Only digit runs delimited by word boundaries are converted, so digits
    glued to letters (``"3rd"``, ``"v2"``) are left untouched.

    Args:
        text: The input string.

    Returns:
        ``text`` with each matched number spelled out by ``number_to_words``.
    """
    def _spell_out(match):
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell_out, text)
def normalize_text(text):
    """Normalize raw input text before it is tokenized for synthesis.

    Steps, in order: spell out integers, remove punctuation, substitute the
    respellings from ``replacements``, then lowercase.

    Args:
        text: The raw input string.

    Returns:
        The normalized, lowercased string.
    """
    # Replace numbers with words
    text = replace_numbers_with_words(text)
    # Remove punctuation first, so the hyphens inside the respellings
    # (e.g. "A-P-I") are not stripped again after substitution.
    text = re.sub(r'[^\w\s]', '', text)
    # The replacement keys are case-sensitive ("API", "PyTorch"), so they
    # must be applied while the text still has its original casing.
    for old, new in replacements:
        text = text.replace(old, new)
    # Convert to lowercase
    return text.lower()
def text_to_speech(text, audio_file=None):
    """Synthesize speech for ``text`` using the default speaker embedding.

    Args:
        text: The text to speak; it is passed through ``normalize_text`` first.
        audio_file: Accepted but not used by this function.

    Returns:
        A tuple ``(16000, samples)`` where ``samples`` is the generated
        waveform as a NumPy array on the CPU.
    """
    cleaned = normalize_text(text)
    encoded = processor(text=cleaned, return_tensors="pt").to(device)

    with torch.no_grad():
        # generate_speech is given the embedding with a leading batch dimension.
        voice = default_embedding.unsqueeze(0)
        waveform = model.generate_speech(encoded["input_ids"], voice, vocoder=vocoder)

    return (16000, waveform.cpu().numpy())
# Gradio UI: one text box in, one audio player out, backed by text_to_speech.
# The input label now says "English" to agree with the title and description.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter English text to convert to speech"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title="English SpeechT5 Text-to-Speech Demo",
    description="Enter English text, and listen to the generated speech.",
)
iface.launch(share=True)