Spaces:
Running
Running
File size: 6,303 Bytes
c9574d9 7b6aa43 c9574d9 ba0fb36 c9574d9 ba0fb36 c9574d9 ba0fb36 c9574d9 a82b119 c9574d9 ba0fb36 c9574d9 7b6aa43 c9574d9 7b6aa43 c9574d9 7b6aa43 c9574d9 7b6aa43 c9574d9 7b6aa43 c9574d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md
import gradio as gr
import os
import re
import soundfile as sf
import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS
from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words
# Markdown blurb passed as the ``description`` of the Gradio interface.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""

# Tokenizer data used by nltk's sent_tokenize.
nltk.download("punkt")

# Pre-download some languages so the first request for them does not have to
# fetch the model. tts_models maps language code -> value returned by
# download() (presumably the local model path; it is what TTS() is given).
tts_models = {}
for _lang in ("eng", "vie", "mya"):
    tts_models[_lang] = download(_lang, "./data")
# Individual names kept so existing references to them keep working.
eng_path = tts_models["eng"]
vie_path = tts_models["vie"]
mya_path = tts_models["mya"]

# Load language codes from lang_code.json with ordered keys.
# The encoding is given explicitly: JSON is UTF-8, while open() otherwise
# falls back to the locale's preferred encoding.
with open("lang_code.json", encoding="utf-8") as f:
    lang_codes = json.load(f, object_pairs_hook=OrderedDict)

# Re-key as "<name> (<code>)" -> "<code>"; these labels are what the dropdown
# shows and what mms_tts() receives.
lang_codes = {f"{name} ({code})": code for name, code in lang_codes.items()}

# Extract language names (dropdown choices, in file order).
language_names = list(lang_codes)

# Load num2words_lang_map. Keys are looked up by language code; element [0]
# of each value is the language identifier handed to num2words.
with open("num2words_lang_map.json", encoding="utf-8") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
def convert_numbers_to_words_num2words(text, lang):
    """Spell out every run of digits in *text* using num2words.

    Args:
        text: Input string.
        lang: Key into ``num2words_lang_map``; the first element of the mapped
            value is passed to ``num2words`` as its ``lang`` argument.

    Returns:
        *text* with each digit run replaced by its word form.
    """
    digit_runs = re.findall(r"\d+", text)
    # Longest runs first, so a short number is never substituted inside a
    # longer one that has not been replaced yet.
    digit_runs.sort(key=len, reverse=True)
    print(digit_runs)

    for digits in digit_runs:
        spoken = num2words(int(digits), lang=num2words_lang_map[lang][0])
        text = text.replace(digits, spoken)

    return text
def convert_mya_numbers_to_words(text):
    """Replace the numbers found in Burmese *text* with their word form.

    Relies on the ``mm_num2word`` package: ``extract_num`` supplies the
    substrings to replace (presumably the numerals in the text) and
    ``mm_num2word`` supplies the replacement for each one.

    Returns:
        The text after all replacements.
    """
    # Imported here rather than at module level, as in the original code.
    from mm_num2word import extract_num, mm_num2word

    found = list(extract_num(text))
    # Longest first, so shorter numerals do not clobber part of a longer one.
    found.sort(key=len, reverse=True)
    print(found)

    for numeral in found:
        spoken = mm_num2word(numeral)
        text = text.replace(numeral, spoken)

    return text
def prepare_sentences(text, lang="mya"):
    """Pre-process *text* for *lang* and split it into a list of sentences.

    Steps:
      1. For ``mya``: spell out numbers and map the Burmese section marks
         U+104A / U+104B to ``,`` / ``.``.
      2. If *lang* is a key of ``num2words_lang_map``: spell out digit runs.
      3. Split on newlines, drop blank paragraphs, then sentence-tokenize each
         paragraph (underthesea for ``vie``, nltk for everything else).

    Args:
        text: Raw input text.
        lang: Language code; compared case-insensitively for ``mya``/``vie``
            but looked up as-is in ``num2words_lang_map``.

    Returns:
        A flat list of non-blank sentences, in input order. Vietnamese
        sentences are additionally passed through ``vie_text_normalize``.
    """
    lang_lower = lang.lower()

    # pre-process the text for some languages
    if lang_lower == "mya":
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",")
        text = text.replace("\u104B", ".")

    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)

    non_blank_paragraphs = [chunk for chunk in text.split("\n") if chunk.strip()]

    result = []
    if lang_lower != "vie":
        for chunk in non_blank_paragraphs:
            for piece in nltk_sent_tokenize(chunk):
                if piece.strip():
                    result.append(piece)
        return result

    # Vietnamese: dedicated tokenizer plus text normalization per sentence.
    for chunk in non_blank_paragraphs:
        pieces = vie_sent_tokenize(chunk)
        for piece in pieces:
            if piece.strip():
                result.append(vie_text_normalize(piece))
    return result
def list_dir():
    """Print the current working directory, then each ``.wav`` entry in it.

    Debugging aid only: produces output on stdout and returns ``None``.
    """
    cwd = os.getcwd()
    print(cwd)

    # One name per line, in the order os.listdir() reports them.
    for entry in os.listdir(cwd):
        if entry.endswith(".wav"):
            print(entry)
def combine_wav(source_dir, stamp):
    """Concatenate all WAV files in *source_dir* into ``{stamp}.wav``.

    Files are joined in alphabetical order of their names. *source_dir* is
    removed afterwards, whether or not combining succeeded, so a failed
    request does not leave its scratch directory behind.

    Args:
        source_dir: Directory holding the per-sentence ``.wav`` files.
        stamp: Base name (without extension) of the combined output file,
            which is written to the current working directory.

    Returns:
        Path of the combined WAV file.

    Raises:
        ValueError: If *source_dir* contains no ``.wav`` files. Previously
            this surfaced as an ``UnboundLocalError`` on the sample rate.
    """
    try:
        # Sort the files alphabetically to ensure the correct order of
        # combination.
        wav_files = sorted(
            name for name in os.listdir(source_dir) if name.endswith(".wav")
        )
        if not wav_files:
            raise ValueError(f"No WAV files to combine in {source_dir!r}")

        combined_data = []
        for name in wav_files:
            data, sr = sf.read(os.path.join(source_dir, name))
            combined_data.extend(data)

        # Save the combined audio to a new WAV file. `sr` is the sample rate
        # of the last file read; all parts are assumed to share it.
        combined_file_path = f"{stamp}.wav"
        sf.write(combined_file_path, combined_data, sr)
    finally:
        # Best-effort cleanup: a leftover scratch directory must not mask the
        # real error, nor fail a request whose output is already written.
        shutil.rmtree(source_dir, ignore_errors=True)

    list_dir()
    # The returned path is what the Gradio audio output serves.
    return combined_file_path
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* in the selected language and return a WAV path.

    The text is split into sentences, each sentence is synthesized to its own
    file in a per-request scratch directory, and the pieces are then merged by
    ``combine_wav``.

    Args:
        Input_Text: Text to speak.
        lang_name: A key of ``lang_codes``, i.e. a label of the form
            ``"<name> (<code>)"``.

    Returns:
        Path of the combined WAV file, as returned by ``combine_wav``.

    Raises:
        KeyError: If *lang_name* is not a known language label.
        gr.Error: If the input contains no sentences (empty or whitespace-only
            text), so the user gets a readable message instead of a crash
            later in the pipeline.
    """
    lang_code = lang_codes[lang_name]

    # Validate the input before the model download/load.
    sentences = prepare_sentences(Input_Text, lang_code)
    if not sentences:
        raise gr.Error("Please enter some text to convert to speech.")

    user_model = download(lang_code, "./data")
    tts = TTS(user_model)

    # A random suffix makes the directory and output name unique per request
    # even when two requests share a timestamp. This replaces the earlier
    # exists()-then-makedirs check, which was open to a race.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    run_id = f"{timestamp}_{uuid.uuid4().hex}"
    user_dir = f"u_{run_id}"
    os.makedirs(user_dir)  # no exist_ok: a collision should fail loudly
    print("New user directory", user_dir)

    try:
        for i, sentence in enumerate(sentences):
            # Zero-padded index keeps alphabetical order == sentence order,
            # which combine_wav relies on.
            tts.synthesis(sentence, wav_path=f"{user_dir}/s_{i:010d}.wav")
        return combine_wav(user_dir, run_id)
    except Exception:
        # Do not leave partial output behind when synthesis or merging fails.
        shutil.rmtree(user_dir, ignore_errors=True)
        raise
# Input widgets: free-form text plus a language picker whose choices are the
# "<name> (<code>)" labels in language_names.
text_input = gr.Textbox(
    lines=5,
    placeholder="Enter text to speech",
    label="Input text",
)
language_dropdown = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)

# Single-function UI: (text, language label) -> audio file produced by mms_tts.
# A separate downloadable-file output was prototyped here but is disabled.
iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[text_input, language_dropdown],
    outputs="audio",
)

iface.launch()
|