final_tts_mix

Runtime error

final_tts_mix / app.py

TranGiaBao

Duplicate from trangiabao17032000/final_tts

77852b1 about 2 years ago

4.91 kB

	import torch
	import soundfile as sf
	import gradio as gr
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
	from transformers import SpeechT5HifiGan
	from datasets import load_dataset
	from IPython.display import Audio
	import numpy as np
	# Hub repository id from which both the processor and the text-to-speech
	# model below are loaded.
	model_name = "trangiabao17032000/final_tts"

	# Processor: loaded from the same repository as the model. Its tokenizer is
	# kept in a separate name because its length sizes the embeddings below.
	processor = SpeechT5Processor.from_pretrained(model_name)
	tokenizer = processor.tokenizer

	# Model: load the SpeechT5 text-to-speech weights, resize the token
	# embeddings to the tokenizer's length, and switch to evaluation mode.
	model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
	model.resize_token_embeddings(len(tokenizer))
	model.eval()

	# Vocoder: HiFi-GAN checkpoint passed to model.generate_speech() in
	# text_to_speech(); also switched to evaluation mode.
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	vocoder.eval()

	# Speaker embedding: take the "xvector" field of entry 7306 from the
	# validation split and add a leading dimension with unsqueeze(0).
	# NOTE(review): which speaker index 7306 corresponds to is not visible
	# here -- confirm against the dataset card before changing it.
	embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

	#cleaner text
	def convert_string_to_numbers(input_str):
	    """Parse a numeric string into an ``int`` or a ``float``.

	    Two notations are understood:

	    * a plain number whose decimal separator is either ``.`` or ``,``
	      (``"3.5"`` and ``"3,5"`` both give ``3.5``);
	    * an integer written with comma thousands separators in groups of three
	      (``"1,000,000"`` gives ``1000000``).

	    A string with a single comma is always read as a decimal number, so
	    ``"1,000"`` is ``1``, not one thousand.

	    Args:
	        input_str: Text to convert; surrounding whitespace is ignored.

	    Returns:
	        An ``int`` when the value has no fractional part, otherwise a
	        ``float``.

	    Raises:
	        ValueError: If the text is empty or is not a number in one of the
	            notations above.
	    """
	    text = input_str.strip()

	    try:
	        # Treat a comma as the decimal separator first ("3,5" -> 3.5).
	        num = float(text.replace(',', '.'))
	    except ValueError:
	        pass
	    else:
	        return int(num) if num.is_integer() else num

	    # Fallback: commas used as thousands separators. Slicing with [:1] keeps
	    # the empty string from raising IndexError.
	    unsigned = text[1:] if text[:1] in ('-', '+') else text
	    groups = unsigned.split(',')
	    well_grouped = (
	        len(groups) > 1
	        and groups[0].isdecimal()
	        and len(groups[0]) <= 3
	        and all(len(group) == 3 and group.isdecimal() for group in groups[1:])
	    )
	    if well_grouped:
	        return int(text.replace(',', ''))

	    raise ValueError("Invalid input: couldn't convert to a number")

	def number_to_vietnamese_words(number):
	    """Spell out a number in lower-case Vietnamese words.

	    The value is rounded to two decimal places. The integer part is read in
	    groups of three digits (nghìn, triệu) and in blocks of nine digits
	    (tỷ), so arbitrarily large integers are supported. Decimals are read
	    digit by digit after "phẩy", with trailing zeros dropped and leading
	    zeros kept (1.05 -> "một phẩy không năm"). Negative values are prefixed
	    with "âm".

	    Args:
	        number: A finite ``int`` or ``float``.

	    Returns:
	        The spelled-out number, without leading or trailing spaces.

	    Raises:
	        ValueError: If ``number`` is NaN.
	        OverflowError: If ``number`` is infinite.
	    """
	    ones = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bảy', 'tám', 'chín']
	    group_units = ['', 'nghìn', 'triệu']
	    billion = 10 ** 9

	    def two_digits(n):
	        # Read 1..99, using "lăm" for a trailing 5 and "mốt" for a trailing 1
	        # after "mươi" (but "mười một" for 11).
	        if n < 10:
	            return ones[n]
	        tens_digit, unit = divmod(n, 10)
	        head = 'mười' if tens_digit == 1 else ones[tens_digit] + ' mươi'
	        if unit == 0:
	            return head
	        if unit == 5:
	            return head + ' lăm'
	        if unit == 1 and tens_digit > 1:
	            return head + ' mốt'
	        return head + ' ' + ones[unit]

	    def three_digits(n, pad):
	        # Read 1..999. With pad=True a missing hundreds digit is spoken as
	        # "không trăm", as required for groups that are not the leading one.
	        hundreds_digit, rest = divmod(n, 100)
	        parts = []
	        if hundreds_digit or pad:
	            parts.append(ones[hundreds_digit] + ' trăm')
	        if rest:
	            if parts and rest < 10:
	                parts.append('lẻ')
	            parts.append(two_digits(rest))
	        return ' '.join(parts)

	    def below_billion(n, pad):
	        # Read 1..999_999_999 as up to three groups: triệu, nghìn, units.
	        groups = []
	        while n:
	            n, group = divmod(n, 1000)
	            groups.append(group)
	        top = len(groups) - 1
	        parts = []
	        for idx in range(top, -1, -1):
	            group = groups[idx]
	            if group == 0:
	                continue
	            text = three_digits(group, pad or idx != top)
	            if group_units[idx]:
	                text += ' ' + group_units[idx]
	            parts.append(text)
	        return ' '.join(parts)

	    def integer_words(n):
	        # Read any non-negative integer; everything above 10**9 is read
	        # recursively and followed by "tỷ".
	        if n == 0:
	            return ones[0]
	        high, low = divmod(n, billion)
	        if high == 0:
	            return below_billion(low, False)
	        text = integer_words(high) + ' tỷ'
	        if low:
	            text += ' ' + below_billion(low, True)
	        return text

	    # Work on the value scaled to hundredths so that rounding can carry into
	    # the integer part (1.999 -> 2) instead of yielding "100" hundredths.
	    scaled = round(abs(number) * 100)
	    if scaled == 0:
	        return 'không'
	    integer_part, decimal_part = divmod(scaled, 100)

	    text = integer_words(integer_part)
	    if decimal_part:
	        digits = str(decimal_part).zfill(2).rstrip('0')
	        text += ' phẩy ' + ' '.join(ones[int(digit)] for digit in digits)
	    if number < 0:
	        text = 'âm ' + text
	    return text

	def is_num(string):
	    """Return True when *string* parses as a finite number.

	    Words that ``float`` accepts but that are not speakable numbers ("nan",
	    "inf", "infinity" and their signed forms) are rejected, so ordinary text
	    containing them is not sent to the number-to-words conversion.

	    Args:
	        string: Candidate token.

	    Returns:
	        ``True`` if ``float(string)`` succeeds and the result is finite,
	        otherwise ``False``. Values ``float`` cannot handle at all (for
	        example ``None``) also give ``False``.
	    """
	    import math

	    try:
	        value = float(string)
	    except (TypeError, ValueError):
	        return False
	    return math.isfinite(value)

	def normalize(input):
	    """Lower-case the text and spell out its numeric tokens in Vietnamese.

	    The text is split on single spaces. Every token accepted by ``is_num``
	    is replaced with the output of ``number_to_vietnamese_words``; all
	    other tokens are kept unchanged. A token that looks numeric but cannot
	    be converted is also kept unchanged, so one bad token does not stop the
	    remaining numbers from being normalized.

	    Args:
	        input: Raw text. The name shadows the builtin but is kept because
	            it is part of the function's signature.

	    Returns:
	        The normalized text, with tokens re-joined by single spaces.
	    """
	    def spell_out(token):
	        if not is_num(token):
	            return token
	        try:
	            return number_to_vietnamese_words(convert_string_to_numbers(token))
	        except (ValueError, OverflowError, IndexError):
	            # Not a convertible number (non-finite or outside the supported
	            # range): leave the token as written.
	            return token

	    return ' '.join(spell_out(token) for token in input.lower().split(" "))

	def split_paragraph_into_sentences(paragraph, max_chars = 300):
	    """Split text into whitespace-separated chunks of bounded length.

	    Words are packed greedily: a word joins the current chunk while the
	    chunk, one space and the word together stay within ``max_chars``.
	    Despite the name, no sentence punctuation is inspected.

	    Args:
	        paragraph: Text to split; any run of whitespace separates words.
	        max_chars: Maximum chunk length in characters. A single word longer
	            than this is not broken up and forms a chunk of its own.

	    Returns:
	        The list of chunks in order; an empty list when ``paragraph`` holds
	        no words.
	    """
	    words = paragraph.split()
	    if not words:
	        # Empty or whitespace-only input: nothing to split.
	        return []

	    sentences = []
	    current_sentence = words[0]
	    for word in words[1:]:
	        if len(current_sentence) + len(word) + 1 <= max_chars:
	            current_sentence += ' ' + word
	        else:
	            sentences.append(current_sentence)
	            current_sentence = word

	    sentences.append(current_sentence)
	    return sentences

	# generator speech
	def text_to_speech(paragraph):
	    """Synthesize speech for a paragraph and return the path of a WAV file.

	    The text is normalized (falling back to plain lower-casing if
	    normalization fails), split into chunks with
	    ``split_paragraph_into_sentences``, synthesized chunk by chunk with the
	    module-level processor, model, speaker embedding and vocoder, and the
	    resulting waveforms are concatenated in order.

	    Args:
	        paragraph: Text to speak.

	    Returns:
	        Path of a newly created ``.wav`` file written with a sample rate of
	        16000. Each call writes its own temporary file so that concurrent
	        requests cannot overwrite each other's audio; the file is not
	        deleted by this function.

	    Raises:
	        ValueError: If ``paragraph`` is empty or contains only whitespace.
	    """
	    import tempfile

	    if not paragraph or not paragraph.strip():
	        raise ValueError("Please enter some text to synthesize.")

	    try:
	        paragraph = normalize(paragraph)
	    except Exception:
	        # Normalization is best effort; still speak the raw text. Unlike a
	        # bare except, this lets KeyboardInterrupt/SystemExit propagate.
	        paragraph = paragraph.lower()

	    # Collect the chunks and concatenate once rather than re-copying the
	    # growing array on every iteration.
	    waveforms = []
	    for sentence in split_paragraph_into_sentences(paragraph):
	        inputs = processor(text=sentence, return_tensors="pt")
	        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
	        waveforms.append(speech.numpy())
	    final_speech = np.concatenate(waveforms)

	    # Reserve a unique file name, close the handle, then let soundfile
	    # write to that path.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
	        output_path = handle.name
	    sf.write(output_path, final_speech, samplerate=16000)
	    return output_path

	# Sample inputs handed to the interface below through its `examples`
	# argument. The second one contains digits ("6", "7").
	tts_examples = [
	"xin chào mọi người, đây là sản phẩm thử nghiệm cho tiếng việt.",
	"Mình sẽ tổ chức sinh nhật vào thứ 6 ngày 7 tháng này",
	]

	# Gradio interface: one text box as input and one audio component as output,
	# wired to text_to_speech(), which returns the path of the WAV file it wrote.
	iface = gr.Interface(
	fn=text_to_speech,
	inputs=gr.Textbox(),
	outputs=gr.Audio(),
	title="Text-to-Speech",
	examples=tts_examples,
	description="Give me something to say!",
	)

	# Launched unconditionally at module level (there is no __main__ guard).
	iface.launch()