File size: 6,303 Bytes
c9574d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b6aa43
c9574d9
 
 
ba0fb36
 
 
c9574d9
ba0fb36
 
c9574d9
 
 
 
 
 
 
 
ba0fb36
c9574d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a82b119
c9574d9
ba0fb36
 
 
c9574d9
 
7b6aa43
c9574d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b6aa43
c9574d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b6aa43
c9574d9
 
 
 
 
 
 
 
 
 
7b6aa43
c9574d9
7b6aa43
 
 
c9574d9
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md

import gradio as gr
import os
import re
import soundfile as sf

import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize  # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS

from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words


# Markdown text passed as the `description` of the Gradio interface below.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""

# Tokenizer data required by nltk's sent_tokenize.
nltk.download("punkt")

# Pre-download some languages at start-up; every other language is fetched
# on demand the first time it is requested.
tts_models = {}
eng_path = download("eng", "./data")
tts_models["eng"] = eng_path
vie_path = download("vie", "./data")
tts_models["vie"] = vie_path
mya_path = download("mya", "./data")
tts_models["mya"] = mya_path

# Load language codes from lang_code.json with ordered keys.
# The encoding is explicit so the file is decoded the same way regardless of
# the locale the process happens to run under.
with open("lang_code.json", encoding="utf-8") as f:
    lang_codes = json.load(f, object_pairs_hook=OrderedDict)

# Re-key the mapping by its display label, e.g. "Burmese (mya)" -> "mya".
lang_codes = {f"{name} ({code})": code for name, code in lang_codes.items()}
# Extract language names (the labels offered in the dropdown)
language_names = list(lang_codes.keys())

# Load num2words_lang_map: per language code, a list whose first entry is the
# language identifier handed to num2words.
with open("num2words_lang_map.json", encoding="utf-8") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)


def convert_numbers_to_words_num2words(text, lang):
    """Spell out every run of digits in *text* with num2words.

    Args:
        text: Input text that may contain digit sequences.
        lang: Key into ``num2words_lang_map``; the first entry stored under
            that key is passed to num2words as its ``lang`` argument.

    Returns:
        The text with each digit sequence replaced by its word form.
    """
    digit_runs = re.findall(r"\d+", text)
    # Longest runs are handled first so that a short number can never rewrite
    # part of a longer one that merely contains it.
    digit_runs.sort(key=len, reverse=True)
    print(digit_runs)

    for digits in digit_runs:
        spoken = num2words(int(digits), lang=num2words_lang_map[lang][0])
        text = text.replace(digits, spoken)

    return text


def convert_mya_numbers_to_words(text):
    """Replace the numbers found by ``extract_num`` with ``mm_num2word`` output.

    Args:
        text: Input text (Burmese, going by the function name and its caller).

    Returns:
        The text with every extracted number substituted by its word form.
    """
    # Imported lazily so the dependency is only needed when this path runs.
    from mm_num2word import extract_num, mm_num2word

    # Longest matches first, so a shorter number cannot clobber part of a
    # longer one during the plain string replacement below.
    found = sorted(extract_num(text), key=len, reverse=True)
    print(found)

    for numeral in found:
        text = text.replace(numeral, mm_num2word(numeral))
    return text


def prepare_sentences(text, lang="mya"):
    """Normalize *text* for the given language and split it into sentences.

    Args:
        text: Raw input text; newlines separate paragraphs.
        lang: Language code. "mya" and "vie" (case-insensitive) get special
            handling; any code present in ``num2words_lang_map`` has its
            numbers spelled out.

    Returns:
        A list of non-blank sentences in document order.
    """
    lang_lower = lang.lower()

    # pre-process the text for some languages
    if lang_lower == "mya":
        text = convert_mya_numbers_to_words(text)
        # Map U+104A / U+104B to "," and "." respectively.
        text = text.translate({0x104A: ",", 0x104B: "."})

    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
    print("Processed text", text)

    # Blank lines carry no content, so drop them before tokenizing.
    paragraphs = [line for line in text.split("\n") if line.strip()]

    if lang_lower != "vie":
        return [
            piece
            for block in paragraphs
            for piece in nltk_sent_tokenize(block)
            if piece.strip()
        ]

    # Vietnamese uses its own tokenizer and an extra normalization pass.
    sentences = []
    for block in paragraphs:
        for piece in vie_sent_tokenize(block):
            if piece.strip():
                sentences.append(vie_text_normalize(piece))
    return sentences


def list_dir():
    """Print the current working directory, then each ``.wav`` file in it."""
    cwd = os.getcwd()
    print(cwd)

    # Walk the listing once and echo only the WAV entries.
    for entry in os.listdir(cwd):
        if entry.endswith(".wav"):
            print(entry)


def combine_wav(source_dir, stamp):
    """Concatenate every WAV file in *source_dir* into ``<stamp>.wav``.

    The files are joined in alphabetical order of their names, the combined
    file is written to the current working directory, and *source_dir* is
    removed afterwards.

    Args:
        source_dir: Directory holding the per-sentence ``.wav`` files.
        stamp: Base name (without extension) of the combined output file.

    Returns:
        Path of the combined WAV file.

    Raises:
        ValueError: If *source_dir* contains no ``.wav`` files. Without this
            check the sample rate would never be set and writing the output
            would fail with an obscure ``UnboundLocalError``.
    """
    # Sort the files alphabetically to ensure the correct order of combination
    wav_files = sorted(
        name for name in os.listdir(source_dir) if name.endswith(".wav")
    )
    if not wav_files:
        raise ValueError(f"No WAV files found in {source_dir!r}; nothing to combine.")

    # Combine the WAV files
    combined_data = []
    sample_rate = None
    for name in wav_files:
        # The sample rate of the last file read is the one used for output.
        data, sample_rate = sf.read(os.path.join(source_dir, name))
        combined_data.extend(data)

    # Save the combined audio to a new WAV file
    combined_file_path = f"{stamp}.wav"
    sf.write(combined_file_path, combined_data, sample_rate)

    # The per-sentence files are no longer needed once merged.
    shutil.rmtree(source_dir)
    list_dir()

    return combined_file_path


def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* in the chosen language and return a WAV path.

    Args:
        Input_Text: Text to speak.
        lang_name: Display label of the language; must be a key of
            ``lang_codes`` (for example "Burmese (mya)").

    Returns:
        Path of the combined WAV file, written to the working directory.

    Raises:
        KeyError: If *lang_name* is not a known language label.
        ValueError: If the input contains no text to synthesize.
    """
    lang_code = lang_codes[lang_name]

    # Reject empty input up front instead of failing later while combining
    # zero audio files, and before paying for a model download and load.
    if not Input_Text or not Input_Text.strip():
        raise ValueError("Input text is empty; nothing to synthesize.")
    sentences = prepare_sentences(Input_Text, lang_code)
    if not sentences:
        raise ValueError("Input text contains no sentences to synthesize.")

    user_model = download(lang_code, "./data")
    tts = TTS(user_model)

    current_datetime = datetime.datetime.now()
    timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")

    # `stamp` names both the scratch directory and the output file. When the
    # timestamp alone collides with an existing directory, a random id is
    # added to BOTH, so the combined file of the other request is not
    # overwritten either.
    stamp = timestamp
    if os.path.exists(f"u_{stamp}"):
        session_id = str(uuid.uuid4())  # Generate a random session ID
        stamp = f"{session_id}_{timestamp}"
    user_dir = f"u_{stamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    # Zero-padded indices keep alphabetical order equal to sentence order,
    # which is what combine_wav relies on.
    for i, sentence in enumerate(sentences):
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
    combined_file_path = combine_wav(user_dir, stamp)
    return combined_file_path


# Input widgets: a multi-line text box and a picker listing every language label.
text_input = gr.Textbox(lines=5, placeholder="Enter text to speech", label="Input text")
language_dropdown = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)

# Wire the synthesis function to the widgets; the returned file path is
# handed to an audio output component.
iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[text_input, language_dropdown],
    outputs="audio",
)


iface.launch()