import logging

import torch
import soundfile as sf
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from datasets import load_dataset
from IPython.display import Audio
import numpy as np

logger = logging.getLogger(__name__)

model_name = "trangiabao17032000/final_tts"

# processor
processor = SpeechT5Processor.from_pretrained(model_name)
tokenizer = processor.tokenizer

# model
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.eval()

# vocoder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder.eval()

# speaker embedding
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Sample rate handed to soundfile when the generated audio is written.
SAMPLE_RATE = 16000
# NOTE: the output path is fixed, so every request overwrites the same file.
OUTPUT_PATH = "tts_example.wav"

# cleaner text

# Digit names indexed by their value.
_DIGITS = ('không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bảy', 'tám', 'chín')
# Unit names of the three-digit groups below one billion, least significant first.
_GROUP_UNITS = ('', 'nghìn', 'triệu')
_BILLION = 10 ** 9


def convert_string_to_numbers(input_str):
    """Parse a numeric token into an ``int`` or a ``float``.

    A comma is accepted as the decimal separator ("1,5" -> 1.5). Values
    without a fractional part are returned as ``int``.

    Raises:
        ValueError: if the token is not a number (including the empty string).
    """
    # Try an exact integer parse first so long digit strings do not lose
    # precision by going through float.
    try:
        return int(input_str)
    except ValueError:
        pass
    try:
        num = float(input_str.replace(',', '.'))
    except ValueError:
        raise ValueError("Invalid input: couldn't convert to a number") from None
    return int(num) if num.is_integer() else num


def _read_group(n, pad=False):
    """Spell out ``0 < n < 1000``.

    With ``pad=True`` a missing hundreds digit is voiced as "không trăm",
    which is needed when a more significant group has already been read.
    """
    hundreds, rest = divmod(n, 100)
    tens, units = divmod(rest, 10)
    parts = []
    if hundreds or pad:
        parts.append(_DIGITS[hundreds] + ' trăm')
    if tens == 0:
        if units:
            if parts:
                # "lẻ" bridges the hundreds word and a lone units digit.
                parts.append('lẻ')
            parts.append(_DIGITS[units])
    else:
        parts.append('mười' if tens == 1 else _DIGITS[tens] + ' mươi')
        if units == 1 and tens > 1:
            parts.append('mốt')   # 21 -> "hai mươi mốt"
        elif units == 5:
            parts.append('lăm')   # 15 -> "mười lăm"
        elif units:
            parts.append(_DIGITS[units])
    return ' '.join(parts)


def _read_integer(n):
    """Spell out a positive integer of any size."""
    billions, rest = divmod(n, _BILLION)
    pieces = []
    if billions:
        # Everything above 10^9 is read recursively and suffixed with "tỷ".
        pieces.append(_read_integer(billions) + ' tỷ')
    groups = []  # three-digit groups, least significant first
    while rest:
        rest, group = divmod(rest, 1000)
        groups.append(group)
    for idx in range(len(groups) - 1, -1, -1):
        group = groups[idx]
        if not group:
            continue
        # Pad with "không trăm" once something more significant was voiced.
        text = _read_group(group, pad=bool(pieces))
        unit = _GROUP_UNITS[idx]
        pieces.append(text + ' ' + unit if unit else text)
    return ' '.join(pieces)


def number_to_vietnamese_words(number):
    """Spell out a number in Vietnamese words.

    The fractional part is rounded to two decimal places and read digit by
    digit after "phẩy"; negative values are prefixed with "âm".

    Raises:
        ValueError: if ``number`` is NaN.
        OverflowError: if ``number`` is infinite.
    """
    magnitude = abs(number)
    integer_part = int(magnitude)
    hundredths = round((magnitude - integer_part) * 100)
    if hundredths >= 100:
        # e.g. 1.999 rounds up to the next integer.
        integer_part += 1
        hundredths = 0
    if integer_part == 0 and hundredths == 0:
        return 'không'

    spoken = _read_integer(integer_part) if integer_part else 'không'
    if hundredths:
        # Zero-pad so 1.05 keeps its leading zero; drop trailing zeros so
        # 1.50 is read like 1.5.
        digits = '{:02d}'.format(hundredths).rstrip('0')
        spoken += ' phẩy ' + ' '.join(_DIGITS[int(d)] for d in digits)
    if number < 0:
        spoken = 'âm ' + spoken
    return spoken


def is_num(string):
    """Return True when ``float(string)`` succeeds."""
    try:
        float(string)
    except ValueError:
        return False
    return True


def normalize(input):
    """Lower-case the text and spell out every space-separated numeric token.

    A token that looks numeric but cannot be spelled out (e.g. "nan", "inf")
    is left unchanged instead of aborting normalization of the whole text.
    """
    def spell(token):
        if not is_num(token):
            return token
        try:
            return number_to_vietnamese_words(convert_string_to_numbers(token))
        except (ValueError, OverflowError):
            return token

    return ' '.join(spell(token) for token in input.lower().split(" "))


def split_paragraph_into_sentences(paragraph, max_chars = 300):
    """Greedily pack whitespace-separated words into chunks of at most
    ``max_chars`` characters.

    A single word longer than ``max_chars`` becomes its own chunk. Returns an
    empty list when the paragraph contains no words.
    """
    words = paragraph.split()
    if not words:
        return []
    sentences = []
    current_sentence = words[0]
    for word in words[1:]:
        if len(current_sentence) + len(word) + 1 <= max_chars:
            current_sentence += ' ' + word
        else:
            sentences.append(current_sentence)
            current_sentence = word
    sentences.append(current_sentence)
    return sentences


# generator speech
def text_to_speech(paragraph):
    """Synthesize ``paragraph`` and return the path of the written WAV file.

    The text is normalized, split into chunks, each chunk is synthesized
    separately, and the waveforms are concatenated into one file.

    Raises:
        gr.Error: if the paragraph contains no words.
    """
    try:
        paragraph = normalize(paragraph)
    except Exception:
        # Best effort: fall back to the raw lower-cased text, but leave a trace.
        logger.exception("Text normalization failed; using raw text")
        paragraph = paragraph.lower()

    list_sentence = split_paragraph_into_sentences(paragraph)
    if not list_sentence:
        raise gr.Error("Please enter some text to synthesize.")

    # Collect the chunks and concatenate once instead of re-copying the
    # growing array on every iteration.
    waveforms = []
    for sentence in list_sentence:
        inputs = processor(text=sentence, return_tensors="pt")
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        waveforms.append(speech.numpy())
    final_speech = np.concatenate(waveforms)

    sf.write(OUTPUT_PATH, final_speech, samplerate=SAMPLE_RATE)
    return OUTPUT_PATH


tts_examples = [
    "xin chào mọi người, đây là sản phẩm thử nghiệm cho tiếng việt.",
    "Mình sẽ tổ chức sinh nhật vào thứ 6 ngày 7 tháng này",
]
# Gradio interface: a single text box in, the synthesized audio out.
iface = gr.Interface(
    text_to_speech,
    gr.Textbox(),
    gr.Audio(),
    examples=tts_examples,
    title="Text-to-Speech",
    description="Give me something to say!",
)

# Start the web UI.
iface.launch()