# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md
import gradio as gr
import os
import re
import soundfile as sf
import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize  # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS
from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words

# Markdown blurb shown at the top of the Gradio app.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper. Please note that for some languages, it may not pronounce all words correctly (yet). """

# Sentence-tokenizer data required by nltk_sent_tokenize.
nltk.download("punkt")

# Pre-download a few commonly requested languages so the first request is fast.
tts_models = {}
for _code in ("eng", "vie", "mya"):
    tts_models[_code] = download(_code, "./data")

# Do some work in the user directory...
# Load language codes from lang_code.json with ordered keys.
with open("lang_code.json") as f:
    lang_codes = json.load(f, object_pairs_hook=OrderedDict)
# Map display label "Name (code)" -> bare ISO code, e.g. "Burmese (mya)" -> "mya".
lang_codes = {key + " (" + lang_codes[key] + ")": lang_codes[key] for key in lang_codes}
# Dropdown choices shown to the user.
language_names = list(lang_codes.keys())

# Map of MMS language code -> list of num2words locale codes (first entry is used).
with open("num2words_lang_map.json") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)


def convert_numbers_to_words_num2words(text, lang):
    """Replace every digit sequence in *text* with its spelled-out form.

    Uses num2words with the locale looked up in ``num2words_lang_map[lang]``.
    Longer numbers are substituted first so that e.g. "2019" is not partially
    rewritten by a later replacement of "19".
    """
    numbers = re.findall(r"\d+", text)
    # Descending length: replace the longest matches first.
    sorted_numbers = sorted(numbers, key=len, reverse=True)
    print(sorted_numbers)
    for number in sorted_numbers:
        number_word = num2words(int(number), lang=num2words_lang_map[lang][0])
        text = text.replace(number, number_word)
    return text


def convert_mya_numbers_to_words(text):
    """Replace Burmese numerals in *text* with Burmese number words."""
    # Imported lazily: only needed for Burmese input.
    from mm_num2word import mm_num2word, extract_num

    numbers = extract_num(text)
    # Longest first, for the same reason as in convert_numbers_to_words_num2words.
    sorted_numbers = sorted(numbers, key=len, reverse=True)
    print(sorted_numbers)
    for n in sorted_numbers:
        text = text.replace(n, mm_num2word(n))
    return text


def prepare_sentences(text, lang="mya"):
    """Normalize *text* and split it into a flat list of sentences.

    Burmese input gets numeral expansion and its section marks (U+104A/U+104B)
    mapped to "," and "."; languages covered by num2words get digit expansion;
    Vietnamese uses underthesea for tokenization/normalization, everything else
    falls back to NLTK's sentence tokenizer.
    """
    sentences = []
    # Language-specific pre-processing.
    if lang.lower() == "mya":
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",").replace("\u104B", ".")
    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
    print("Processed text", text)
    paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
    if lang.lower() == "vie":
        for paragraph in paragraphs:
            sentences_raw = vie_sent_tokenize(paragraph)
            sentences.extend(
                [
                    vie_text_normalize(sentence)
                    for sentence in sentences_raw
                    if sentence.strip()
                ]
            )
    else:
        sentences = [
            sentence
            for paragraph in paragraphs
            for sentence in nltk_sent_tokenize(paragraph)
            if sentence.strip()
        ]
    return sentences


def list_dir():
    """Print the current working directory and the WAV files it contains (debug aid)."""
    current_dir = os.getcwd()
    print(current_dir)
    wav_files = [name for name in os.listdir(current_dir) if name.endswith(".wav")]
    for wav_file in wav_files:
        print(wav_file)


def combine_wav(source_dir, stamp):
    """Concatenate all WAV files in *source_dir* into ``{stamp}.wav``.

    Files are combined in alphabetical order (the synthesis step zero-pads the
    sentence index, so alphabetical == chronological). *source_dir* is deleted
    afterwards. Returns the path of the combined file.

    Raises ValueError if *source_dir* contains no WAV files (previously this
    crashed with a NameError because the sample rate was never bound).
    """
    wav_files = [name for name in os.listdir(source_dir) if name.endswith(".wav")]
    if not wav_files:
        raise ValueError(f"No WAV files found in {source_dir}")
    # Alphabetical sort keeps the sentence order s_0000000000.wav, s_0000000001.wav, ...
    wav_files.sort()
    combined_data = []
    sr = None
    for name in wav_files:
        data, sr = sf.read(os.path.join(source_dir, name))
        combined_data.extend(data)
    combined_file_path = f"{stamp}.wav"
    sf.write(combined_file_path, combined_data, sr)
    # Clean up the per-request scratch directory.
    shutil.rmtree(source_dir)
    list_dir()
    # Display the combined audio in the Hugging Face Space app
    return combined_file_path


def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* in the selected language and return a WAV path.

    Downloads the MMS model for the chosen language on demand, synthesizes one
    WAV per sentence into a unique per-request directory, then merges them.
    """
    lang_code = lang_codes[lang_name]
    user_model = download(lang_code, "./data")
    tts = TTS(user_model)
    sentences = prepare_sentences(Input_Text, lang_code)
    # Timestamp (with microseconds) makes the scratch dir unique per request.
    current_datetime = datetime.datetime.now()
    timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        # Extremely unlikely collision: fall back to a random session id.
        session_id = str(uuid.uuid4())
        user_dir = f"u_{session_id}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)
    for i, sentence in enumerate(sentences):
        # Zero-padded index so alphabetical order matches sentence order.
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
    combined_file_path = combine_wav(user_dir, timestamp)
    return combined_file_path


iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter text to speech", label="Input text"),
        gr.Dropdown(
            choices=language_names,
            label="Select language 1,000+",
            value="Burmese (mya)",
        ),
    ],
    outputs="audio",
)
iface.launch()