Spaces:
Runtime error
Runtime error
| import torch | |
| import soundfile as sf | |
| import gradio as gr | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech | |
| from transformers import SpeechT5HifiGan | |
| from datasets import load_dataset | |
| from IPython.display import Audio | |
| import numpy as np | |
# Checkpoint used for both the processor and the text-to-speech model.
model_name = "trangiabao17032000/final_tts"

# Processor; its tokenizer is kept separately so the model's embedding
# table can be sized from it below.
processor = SpeechT5Processor.from_pretrained(model_name)
tokenizer = processor.tokenizer

# Text-to-speech model: embeddings resized to len(tokenizer), then put in
# evaluation mode.
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.eval()

# Vocoder, also in evaluation mode.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder.eval()

# Speaker embedding: the "xvector" field of row 7306 of the validation
# split, with a leading axis added by unsqueeze(0).
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.tensor(
    embeddings_dataset[7306]["xvector"]
).unsqueeze(0)
| #cleaner text | |
def convert_string_to_numbers(input_str):
    """Parse one numeric token into an ``int`` or a ``float``.

    A token that becomes a valid float once its commas are replaced by
    periods is read that way, so a single comma acts as a decimal separator
    ("3,5" -> 3.5). Otherwise, a token made of digit groups joined by one
    repeated separator is read as a grouped integer
    ("1,000,000" or "1.000.000" -> 1000000).

    Args:
        input_str: Text of a single token, optionally signed.

    Returns:
        An ``int`` when the value is whole, otherwise a ``float``.

    Raises:
        ValueError: If the token is empty, not numeric, or not finite.
    """
    from math import isfinite

    text = input_str.strip()

    try:
        num = float(text.replace(',', '.'))
    except ValueError:
        num = None

    if num is not None:
        # nan/inf parse as floats but cannot be spelled out as a number.
        if not isfinite(num):
            raise ValueError("Invalid input: couldn't convert to a number")
        return int(num) if num.is_integer() else num

    # Not a plain float: accept digit groups such as "1,000,000".
    sign = -1 if text.startswith('-') else 1
    unsigned = text[1:] if text[:1] in ('+', '-') else text
    for separator in (',', '.'):
        groups = unsigned.split(separator)
        if (
            len(groups) > 2
            and all(group.isdigit() for group in groups)
            and 1 <= len(groups[0]) <= 3
            and all(len(group) == 3 for group in groups[1:])
        ):
            return sign * int(''.join(groups))

    raise ValueError("Invalid input: couldn't convert to a number")
def number_to_vietnamese_words(number):
    """Spell out a number in lower-case Vietnamese words.

    The value is rounded to two decimal places. Negative values are
    prefixed with "âm", the fractional part is read digit by digit after
    "phẩy" (trailing zeros dropped, inner zeros read as "không"), and
    values of a billion or more are read as "<count> tỷ <remainder>".

    Args:
        number: A finite ``int`` or ``float``.

    Returns:
        The Vietnamese reading, e.g. 15 -> "mười lăm",
        3.05 -> "ba phẩy không năm", -7 -> "âm bảy".

    Raises:
        ValueError: If ``number`` is NaN.
        OverflowError: If ``number`` is infinite.
    """
    digits = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bảy', 'tám', 'chín']
    scales = ['', 'nghìn', 'triệu']
    billion = 10 ** 9

    def two_digits(n):
        """Read 0 < n < 100."""
        ten, unit = divmod(n, 10)
        if ten == 0:
            return digits[unit]
        head = 'mười' if ten == 1 else digits[ten] + ' mươi'
        if unit == 0:
            return head
        if unit == 5:
            # 5 after a tens word is read "lăm" (15, 25, ...).
            return head + ' lăm'
        if unit == 1 and ten > 1:
            # 1 after "mươi" is read "mốt" (21, 31, ...); 11 stays "mười một".
            return head + ' mốt'
        return head + ' ' + digits[unit]

    def three_digits(n, pad):
        """Read 0 < n < 1000; ``pad`` spells a missing hundreds digit."""
        hundred, rest = divmod(n, 100)
        parts = []
        if hundred or pad:
            parts.append(digits[hundred] + ' trăm')
        if rest:
            if parts and rest < 10:
                parts.append('lẻ')
            parts.append(two_digits(rest))
        return ' '.join(parts)

    def below_billion(n, pad):
        """Read 0 < n < 10**9 as unit / nghìn / triệu groups."""
        groups = []
        while n:
            n, group = divmod(n, 1000)
            groups.append((group, scales[len(groups)]))
        pieces = []
        for group, label in reversed(groups):
            if group:
                text = three_digits(group, pad)
                pieces.append(text + ' ' + label if label else text)
            # Every group after the leading one is read with its hundreds
            # digit, e.g. 1005 -> "một nghìn không trăm lẻ năm".
            pad = True
        return ' '.join(pieces)

    def integer_words(n):
        """Read a non-negative integer of any size."""
        if n == 0:
            return digits[0]
        if n < billion:
            return below_billion(n, False)
        high, low = divmod(n, billion)
        text = integer_words(high) + ' tỷ'
        if low:
            text += ' ' + below_billion(low, True)
        return text

    negative = number < 0
    # Work in whole hundredths so rounding can carry into the integer part
    # (3.999 -> 4) and leading zeros of the fraction survive (3.05).
    hundredths = int(round(abs(number) * 100))
    if hundredths == 0:
        return 'không'

    integer_part, fraction = divmod(hundredths, 100)
    result = integer_words(integer_part)
    if fraction:
        fraction_digits = '{:02d}'.format(fraction).rstrip('0')
        result += ' phẩy ' + ' '.join(digits[int(d)] for d in fraction_digits)
    return 'âm ' + result if negative else result
def is_num(string):
    """Tell whether ``string`` is a single finite number.

    Anything ``float()`` accepts counts, except NaN and the infinities.
    One comma sitting between two digits is also accepted as a decimal
    separator ("3,5"), which is the form convert_string_to_numbers reads.

    Args:
        string: The token to test.

    Returns:
        ``True`` for a finite number, ``False`` otherwise (including for
        values ``float()`` cannot take at all, such as ``None``).
    """
    from math import isfinite

    candidate = string
    if isinstance(string, str) and string.count(',') == 1:
        whole, fraction = string.split(',')
        # Only a comma between digits is a decimal mark; "5," is
        # punctuation and must not be read as a number.
        if whole[-1:].isdigit() and fraction[:1].isdigit():
            candidate = whole + '.' + fraction

    try:
        value = float(candidate)
    except (TypeError, ValueError, OverflowError):
        return False
    return isfinite(value)
def normalize(input):
    """Lower-case the text and spell out its numeric tokens in Vietnamese.

    The text is split on single spaces. For each token, trailing
    punctuation (``. , ; : ! ?``) is set aside, the remainder is converted
    when is_num() accepts it, and the punctuation is put back. A token that
    is not numeric, or whose conversion fails, is kept unchanged so one bad
    token cannot abort normalization of the whole paragraph.

    Args:
        input: The text to normalize.

    Returns:
        The normalized text, with the original spacing preserved.
    """
    trailing_punctuation = '.,;:!?'

    def spell_out(token):
        """Return ``token`` with a leading number replaced by its reading."""
        core = token.rstrip(trailing_punctuation)
        if not core or not is_num(core):
            return token
        tail = token[len(core):]
        try:
            words = number_to_vietnamese_words(convert_string_to_numbers(core))
        except (ValueError, OverflowError, IndexError):
            # Conversion is best effort: leave the token as written.
            return token
        return words + tail

    return ' '.join(spell_out(token) for token in input.lower().split(' '))
def split_paragraph_into_sentences(paragraph, max_chars = 300):
    """Cut ``paragraph`` into chunks of whole words.

    Words are packed greedily: a word joins the current chunk while the
    chunk, a joining space and the word together stay within ``max_chars``.
    A single word longer than ``max_chars`` is kept whole as its own chunk.

    Args:
        paragraph: The text to split; whitespace runs separate words.
        max_chars: Maximum chunk length in characters.

    Returns:
        The list of chunks, empty when ``paragraph`` has no words.
    """
    words = paragraph.split()
    if not words:
        # Empty or whitespace-only input: there is nothing to chunk.
        return []

    chunks = []
    current = words[0]
    for word in words[1:]:
        if len(current) + 1 + len(word) <= max_chars:
            current += ' ' + word
        else:
            chunks.append(current)
            current = word
    chunks.append(current)
    return chunks
| # generator speech | |
def text_to_speech(paragraph):
    """Synthesize ``paragraph`` and return the path of the written WAV file.

    The text is normalized (falling back to plain lower-casing if
    normalization fails) and cut into chunks. Each chunk is run through the
    model with the module-level speaker embedding and vocoder, and the
    waveforms are joined and written to one file at a 16000 Hz sample rate.

    Args:
        paragraph: The text to speak.

    Returns:
        The string ``"tts_example.wav"``, the file that was written.

    Raises:
        ValueError: If ``paragraph`` is empty or only whitespace.
    """
    if not paragraph or not paragraph.strip():
        raise ValueError("Please enter some text to synthesize.")

    try:
        paragraph = normalize(paragraph)
    except Exception:
        # Best effort: if number normalization fails, still speak the text.
        paragraph = paragraph.lower()

    # Collect the chunks and join once; concatenating inside the loop
    # re-copies all audio produced so far on every iteration.
    waveforms = []
    for sentence in split_paragraph_into_sentences(paragraph):
        inputs = processor(text=sentence, return_tensors="pt")
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        waveforms.append(speech.numpy())
    final_speech = np.concatenate(waveforms) if waveforms else np.array([])

    # NOTE(review): every call writes the same path, so overlapping requests
    # could overwrite each other's audio - confirm how the app is served.
    sf.write("tts_example.wav", final_speech, samplerate=16000)
    return "tts_example.wav"
# Example prompts passed to the interface.
tts_examples = [
    "xin chào mọi người, đây là sản phẩm thử nghiệm cho tiếng việt.",
    "Mình sẽ tổ chức sinh nhật vào thứ 6 ngày 7 tháng này",
]

# Gradio interface: a text box in, synthesized audio out.
iface = gr.Interface(
    title="Text-to-Speech",
    description="Give me something to say!",
    fn=text_to_speech,
    inputs=gr.Textbox(),
    outputs=gr.Audio(),
    examples=tts_examples,
)

# Start the app as soon as the module runs.
iface.launch()