# Spaces: Runtime error  (status banner captured together with this file; not part of the program)
import math
import re
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from IPython.display import Audio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
# Checkpoint that provides both the text processor and the acoustic model.
model_name = "trangiabao17032000/final_tts"

# Text processor; its tokenizer is kept separately because the model's
# embedding table is resized to the tokenizer's length below.
processor = SpeechT5Processor.from_pretrained(model_name)
tokenizer = processor.tokenizer

# Text-to-speech model, resized to the tokenizer vocabulary and switched
# to evaluation mode.
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.eval()

# Vocoder passed to `generate_speech`; `eval()` returns the module itself,
# so it can be chained onto the load.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").eval()

# Speaker embedding: one x-vector picked from the validation split, with a
# leading dimension added via `unsqueeze(0)`.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.tensor(
    embeddings_dataset[7306]["xvector"]
).unsqueeze(0)
# Text cleaning: helpers that turn numeric tokens into Vietnamese words
def convert_string_to_numbers(input_str):
    """Parse a numeric string into an ``int`` or ``float``.

    A comma is first treated as a decimal separator (``"3,5"`` -> ``3.5``).
    If that fails, the string is accepted as an integer with comma
    thousands separators (``"1,000,000"`` -> ``1000000``), which is what
    the original fallback branch intended but could never reach.

    Note that a single-comma value such as ``"1,000"`` is still read as the
    decimal ``1.000`` and therefore returns ``1``.

    Args:
        input_str: Text of the number.

    Returns:
        An ``int`` when the value is integral, otherwise a ``float``.

    Raises:
        ValueError: If the text is empty, not numeric, or not finite
            (``"nan"``, ``"inf"``).
    """
    try:
        # Replace comma with period and attempt to convert to a float.
        num = float(input_str.replace(',', '.'))
    except ValueError:
        # Only well-formed digit groups count as thousands separators, so
        # inputs such as "1,2,3" are still rejected.
        if re.fullmatch(r'[+-]?\d{1,3}(?:,\d{3})+', input_str.strip()):
            return int(input_str.replace(',', ''))
        raise ValueError("Invalid input: couldn't convert to a number") from None

    # float() accepts "nan"/"inf"; those cannot be spelled out as numbers.
    if not math.isfinite(num):
        raise ValueError("Invalid input: couldn't convert to a number")

    if num.is_integer():
        return int(num)
    return num
def number_to_vietnamese_words(number):
    """Spell out a number in lowercase Vietnamese words.

    The integer part is read in groups of three digits (nghìn, triệu) and
    in multiples of one billion (tỷ), so values of any size are supported.
    The fractional part is rounded to two decimal places and read digit by
    digit after "phẩy"; trailing zeros are dropped and leading zeros are
    kept ("1.05" -> "một phẩy không năm"). Negative values are prefixed
    with "âm".

    Args:
        number: An ``int`` or ``float``.

    Returns:
        The spelled-out number, without leading or trailing spaces.

    Raises:
        ValueError: If ``number`` is NaN or infinite.
    """
    digits = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bảy', 'tám', 'chín']
    group_units = ['', 'nghìn', 'triệu']
    billion = 10 ** 9

    def below_hundred(n):
        """Spell 1..99."""
        tens, unit = divmod(n, 10)
        if tens == 0:
            return digits[unit]
        head = 'mười' if tens == 1 else digits[tens] + ' mươi'
        if unit == 0:
            return head
        if unit == 5:
            # A unit of five after any tens word is read "lăm".
            return head + ' lăm'
        if unit == 1 and tens > 1:
            # A unit of one after "... mươi" is read "mốt" (but 11 is "mười một").
            return head + ' mốt'
        return head + ' ' + digits[unit]

    def below_thousand(n, padded):
        """Spell 1..999; when ``padded``, a zero hundreds digit is read aloud."""
        hundreds, rest = divmod(n, 100)
        parts = []
        if hundreds or padded:
            parts.append(digits[hundreds] + ' trăm')
            if 0 < rest < 10:
                parts.append('lẻ')
        if rest:
            parts.append(below_hundred(rest))
        return ' '.join(parts)

    def below_billion(n, padded):
        """Spell 1..999_999_999 as up to three groups of three digits."""
        groups = []
        while n:
            n, group = divmod(n, 1000)
            groups.append(group)
        parts = []
        for index in reversed(range(len(groups))):
            group = groups[index]
            if not group:
                continue
            # Every group except the very first spoken one keeps its
            # hundreds position ("một nghìn không trăm lẻ năm").
            text = below_thousand(group, padded or index < len(groups) - 1)
            if group_units[index]:
                text += ' ' + group_units[index]
            parts.append(text)
        return ' '.join(parts)

    def integer_words(n):
        """Spell a non-negative integer of any magnitude."""
        if n == 0:
            return digits[0]
        high, low = divmod(n, billion)
        parts = []
        if high:
            # Recursing on the part above one billion yields "nghìn tỷ",
            # "triệu tỷ", ... instead of running out of unit names.
            parts.append(integer_words(high) + ' tỷ')
        if low:
            parts.append(below_billion(low, bool(high)))
        return ' '.join(parts)

    if isinstance(number, float) and not math.isfinite(number):
        raise ValueError("Cannot spell out a non-finite number")

    # Work in hundredths so rounding carries into the integer part
    # (1.999 -> 2) and leading fractional zeros are not lost.
    scaled = int(round(abs(number) * 100))
    integer_part, hundredths = divmod(scaled, 100)

    spoken = integer_words(integer_part)
    if hundredths:
        fraction_digits = f'{hundredths:02d}'.rstrip('0')
        spoken += ' phẩy ' + ' '.join(digits[int(d)] for d in fraction_digits)
    if number < 0 and scaled:
        spoken = 'âm ' + spoken
    return spoken
def is_num(string):
    """Return True when ``string`` parses as a finite number.

    ``float()`` also accepts "nan", "inf" and "infinity"; those are
    rejected here because they cannot be spelled out as a number and would
    otherwise make the number-to-words conversion fail.

    Args:
        string: Token to test.

    Returns:
        ``True`` if ``float(string)`` succeeds and is finite, else ``False``.
    """
    try:
        value = float(string)
    except ValueError:
        return False
    return math.isfinite(value)
def normalize(input):
    """Lowercase the text and spell out its numeric tokens in Vietnamese.

    The lowercased text is split on single spaces; each token that
    ``is_num`` accepts is replaced by its spelled-out form, every other
    token is kept as is, and the tokens are joined back with single spaces.
    """
    tokens = []
    for token in input.lower().split(" "):
        if is_num(token):
            token = number_to_vietnamese_words(convert_string_to_numbers(token))
        tokens.append(token)
    return ' '.join(tokens)
def split_paragraph_into_sentences(paragraph, max_chars=300):
    """Greedily pack whitespace-separated words into chunks of limited length.

    Words are never split: a chunk is closed as soon as adding the next
    word (plus one joining space) would exceed ``max_chars``. A single word
    longer than ``max_chars`` therefore becomes a chunk of its own.

    Args:
        paragraph: Text to split.
        max_chars: Maximum length of a chunk, in characters.

    Returns:
        A list of chunks; an empty list when ``paragraph`` contains no
        words (the original raised ``IndexError`` in that case).
    """
    chunks = []
    current_words = []
    current_len = 0
    for word in paragraph.split():
        # One extra character for the joining space, except for the first word.
        added = len(word) + (1 if current_words else 0)
        if current_words and current_len + added > max_chars:
            chunks.append(' '.join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len += added
    if current_words:
        chunks.append(' '.join(current_words))
    return chunks
# Speech generation
def text_to_speech(paragraph):
    """Synthesize speech for ``paragraph`` and return the path of a WAV file.

    The text is normalized (lowercased, numbers spelled out), split into
    chunks, and each chunk is synthesized separately; the waveforms are
    concatenated and written as one file with a 16000 Hz sample rate.

    Args:
        paragraph: Text to speak.

    Returns:
        Path of the written WAV file, or ``None`` when there is no text to
        speak (empty or whitespace-only input previously crashed).
    """
    if not paragraph or not paragraph.strip():
        return None

    try:
        paragraph = normalize(paragraph)
    except Exception:
        # Best effort: if number normalization fails, speak the plain
        # lowercased text. Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        paragraph = paragraph.lower()

    waveforms = []
    for sentence in split_paragraph_into_sentences(paragraph):
        inputs = processor(text=sentence, return_tensors="pt")
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        waveforms.append(speech.numpy())
    # Concatenate once instead of re-copying the growing array per chunk.
    final_speech = np.concatenate(waveforms)

    # A unique file per call: a fixed name would let overlapping requests
    # overwrite each other's audio. The file is left in place because its
    # path is the return value.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, final_speech, samplerate=16000)
    return output_path
# Example prompts offered in the interface.
tts_examples = [
    "xin chào mọi người, đây là sản phẩm thử nghiệm cho tiếng việt.",
    "Mình sẽ tổ chức sinh nhật vào thứ 6 ngày 7 tháng này",
]

# Gradio interface: one text box in, one audio player out, backed by
# `text_to_speech`.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(),
    outputs=gr.Audio(),
    examples=tts_examples,
    title="Text-to-Speech",
    description="Give me something to say!",
)

# Start serving the interface as soon as the script runs.
iface.launch()