final_tts_mix / app.py
TranGiaBao
Duplicate from trangiabao17032000/final_tts
77852b1
raw
history blame
4.91 kB
import torch
import soundfile as sf
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from datasets import load_dataset
from IPython.display import Audio
import numpy as np
# Hub identifier of the fine-tuned checkpoint; the processor and the
# acoustic model below are both loaded from it.
model_name = "trangiabao17032000/final_tts"
# Processor: turns input text into model inputs. Its tokenizer is kept
# separately because its length is needed to size the embedding table.
processor = SpeechT5Processor.from_pretrained(model_name)
tokenizer = processor.tokenizer
# Acoustic model. The token embeddings are resized to the tokenizer's length
# so both agree on the vocabulary size; eval() switches to inference mode.
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.eval()
# Vocoder handed to model.generate_speech() in text_to_speech().
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder.eval()
# Speaker embedding: a single x-vector taken from the validation split, with a
# leading dimension added by unsqueeze(0).
# NOTE(review): index 7306 presumably selects one specific speaker voice --
# confirm against the dataset before changing it.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
#cleaner text
def convert_string_to_numbers(input_str):
    """Parse a numeric token into an ``int`` or a ``float``.

    The token is first read with a comma acting as the decimal separator
    ("1,5" -> 1.5, "1,234" -> 1.234). If that fails, commas are treated as
    grouping separators of an integer ("1,234,567" -> 1234567).

    Args:
        input_str: The token to parse.

    Returns:
        An ``int`` when the value is integral ("3.0" -> 3), otherwise a
        ``float``. Non-finite values such as "nan" or "inf" are returned as
        floats unchanged.

    Raises:
        ValueError: If the token (including the empty string) is not a number.
    """
    try:
        num = float(input_str.replace(',', '.'))
    except ValueError:
        # More than one separator, e.g. "1,234,567": drop the grouping commas
        # and accept the remainder only if it is an optionally signed integer.
        digits = input_str.replace(',', '')
        unsigned = digits[1:] if digits.startswith('-') else digits
        if unsigned.isdecimal():
            return int(digits)
        raise ValueError("Invalid input: couldn't convert to a number") from None
    if num.is_integer():
        return int(num)
    return num
def number_to_vietnamese_words(number):
    """Spell out ``number`` in Vietnamese words.

    The value is rounded to two decimal places. The integer part is read in
    groups of three digits (nghìn, triệu, tỷ); the fractional part, if any, is
    read digit by digit after "phẩy", keeping leading zeros and dropping a
    trailing zero (1.05 -> "một phẩy không năm", 1.5 -> "một phẩy năm").

    Reading rules applied:
      * a unit of 5 after a tens digit is "lăm" (15 -> "mười lăm");
      * a unit of 1 after a tens digit of 2 or more is "mốt"
        (21 -> "hai mươi mốt");
      * a single unit after a hundreds word is joined with "lẻ"
        (105 -> "một trăm lẻ năm");
      * groups that follow a higher group spell out a missing hundreds digit
        (1005 -> "một nghìn không trăm lẻ năm");
      * negative values are prefixed with "âm";
      * values of 10^12 and above repeat "tỷ" (10^12 -> "một nghìn tỷ").

    Args:
        number: An ``int`` or ``float``.

    Returns:
        The spelled-out number, lowercase, with single spaces between words
        and no leading or trailing whitespace.

    Raises:
        ValueError: If ``number`` is NaN.
        OverflowError: If ``number`` is infinite.
    """
    ones = ['', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bảy', 'tám', 'chín']
    digit_names = ['không'] + ones[1:]
    scales = ['', 'nghìn', 'triệu']

    def two_digits(n):
        # Reads 1..99.
        if n < 10:
            return ones[n]
        tens_digit, unit = divmod(n, 10)
        head = 'mười' if tens_digit == 1 else ones[tens_digit] + ' mươi'
        if unit == 0:
            return head
        if unit == 5:
            return head + ' lăm'
        if unit == 1 and tens_digit > 1:
            return head + ' mốt'
        return head + ' ' + ones[unit]

    def three_digits(n, pad):
        # Reads 1..999. With pad=True a missing hundreds digit is spelled out
        # as "không trăm", which is needed for groups after the leading one.
        hundred, rest = divmod(n, 100)
        parts = []
        if hundred or pad:
            parts.append(digit_names[hundred] + ' trăm')
        if rest:
            if parts and rest < 10:
                parts.append('lẻ')
            parts.append(two_digits(rest))
        return ' '.join(parts)

    def below_billion(n, pad):
        # Reads 1..999_999_999 as up to three groups of three digits.
        groups = []
        while n:
            n, group = divmod(n, 1000)
            groups.append(group)  # least significant group first
        top = len(groups) - 1
        pieces = []
        for idx in range(top, -1, -1):
            if groups[idx] == 0:
                continue
            text = three_digits(groups[idx], pad or idx != top)
            if scales[idx]:
                text += ' ' + scales[idx]
            pieces.append(text)
        return ' '.join(pieces)

    def integer_words(n):
        # Reads any non-negative integer; the count of billions is read
        # recursively so arbitrarily large values are supported.
        if n == 0:
            return 'không'
        billions, rest = divmod(n, 10 ** 9)
        pieces = []
        if billions:
            pieces.append(integer_words(billions) + ' tỷ')
        if rest:
            pieces.append(below_billion(rest, bool(billions)))
        return ' '.join(pieces)

    negative = number < 0
    magnitude = -number if negative else number
    # Work in whole hundredths so that rounding carries into the integer part
    # (1.999 -> 2) and leading zeros of the fraction are not lost.
    hundredths = round(magnitude * 100)
    integer_part, fraction = divmod(hundredths, 100)

    text = integer_words(integer_part)
    if fraction:
        digits = str(fraction).zfill(2).rstrip('0')
        text += ' phẩy ' + ' '.join(digit_names[int(d)] for d in digits)
    # A value that rounds to zero (e.g. -0.001) is read as plain "không".
    if negative and hundredths:
        text = 'âm ' + text
    return text
def is_num(string):
    """Return True if ``string`` parses as a finite number.

    ``float()`` also accepts "nan", "inf" and "infinity" and turns overflowing
    literals such as "1e400" into infinity. None of these can be spelled out
    as a number, so they are reported as non-numeric. Values that ``float()``
    cannot handle at all (e.g. ``None``) are non-numeric as well.
    """
    try:
        value = float(string)
    except (TypeError, ValueError):
        return False
    # Both comparisons are False for NaN, and one of them is False for +/-inf.
    return float('-inf') < value < float('inf')
def normalize(input):
    """Lowercase the text and spell out its numeric tokens in Vietnamese.

    The text is split on single spaces, so the original spacing is preserved
    when the tokens are joined back together. A token is replaced only when
    ``is_num`` accepts it; a token that is accepted but still cannot be
    converted (for example "inf" or "nan") is left unchanged instead of
    aborting normalisation of the whole text.

    Args:
        input: The raw text. The name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        The normalised text.
    """
    def verbalize(token):
        if not is_num(token):
            return token
        try:
            return number_to_vietnamese_words(convert_string_to_numbers(token))
        except (ValueError, OverflowError, IndexError):
            # The converter could not handle this value; keep the raw token.
            return token

    return ' '.join(verbalize(token) for token in input.lower().split(" "))
def split_paragraph_into_sentences(paragraph, max_chars = 300):
    """Split ``paragraph`` into chunks of at most ``max_chars`` characters.

    Despite the name, the text is not split on sentence boundaries: words are
    packed greedily, joined by single spaces, and a new chunk is started when
    adding the next word would exceed ``max_chars``. A single word longer than
    ``max_chars`` is never broken up and becomes a chunk of its own.

    Args:
        paragraph: The text to split; any run of whitespace separates words.
        max_chars: Maximum length of a chunk.

    Returns:
        The list of chunks, in order. An empty or whitespace-only paragraph
        yields an empty list.
    """
    chunks = []
    current = ''
    for word in paragraph.split():
        if not current:
            # First word of the paragraph.
            current = word
        elif len(current) + len(word) + 1 <= max_chars:
            current += ' ' + word
        else:
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks
# generator speech
def text_to_speech(paragraph):
    """Synthesize ``paragraph`` and return the path of the resulting WAV file.

    The text is normalised (lowercased, numbers spelled out), split into
    chunks, and each chunk is synthesized separately; the waveforms are then
    joined and written to disk with a 16 kHz sample rate.

    Args:
        paragraph: The text to speak.

    Returns:
        Path of a newly created WAV file, or ``None`` when there is no text
        to speak.
    """
    # Local import so the block does not depend on the file-level imports.
    import tempfile

    if not paragraph or not paragraph.strip():
        # Nothing to synthesize: report "no audio" instead of failing later.
        return None

    try:
        paragraph = normalize(paragraph)
    except Exception:
        # Normalisation is best effort; fall back to lowercasing only.
        paragraph = paragraph.lower()

    waveforms = []
    for sentence in split_paragraph_into_sentences(paragraph):
        inputs = processor(text=sentence, return_tensors="pt")
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        waveforms.append(speech.numpy())
    # Join once at the end instead of re-copying the growing buffer for every
    # chunk; float64 matches the dtype that was written previously.
    final_speech = np.concatenate(waveforms).astype(np.float64)

    # A unique file per call, so concurrent requests cannot overwrite each
    # other's output. The file is left on disk for the caller to read.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        output_path = wav_file.name
    sf.write(output_path, final_speech, samplerate=16000)
    return output_path
# Example prompts offered in the UI (passed as `examples` below).
# In English: "Hello everyone, this is a trial product for Vietnamese." and
# "I will hold my birthday party on Friday the 7th of this month."; the second
# one contains stand-alone digits ("6", "7").
tts_examples = [
"xin chào mọi người, đây là sản phẩm thử nghiệm cho tiếng việt.",
"Mình sẽ tổ chức sinh nhật vào thứ 6 ngày 7 tháng này",
]
# Gradio interface: a single text box in, an audio component out; the audio
# component receives whatever text_to_speech() returns.
iface = gr.Interface(
fn=text_to_speech,
inputs=gr.Textbox(),
outputs=gr.Audio(),
title="Text-to-Speech",
examples=tts_examples,
description="Give me something to say!",
)
# Runs as soon as this module is executed (there is no __main__ guard).
iface.launch()