Spaces:
Running
Running
File size: 2,815 Bytes
e4eb5c5 944dedf 4492d6d 19be65d a82f51b 93b0a99 4fda610 a82f51b 4fda610 a82f51b 4eb15f6 944dedf 8ce2dae 19be65d 7e5c84b 686e3d3 944dedf f6a94c1 ab3b67e f6a94c1 944dedf 1782e10 0f3a7cd f6a94c1 ab3b67e 1782e10 ab3b67e 944dedf f6a94c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import functools
import typing
import wave
from io import BytesIO

import gradio as gr
import hazm
import numpy as np
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
# hazm text-processing helpers, built once at import time and shared by
# preprocess_text() and fix_words().
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()

# Fetch the POS-tagger model file from the Hugging Face Hub, then hand its
# local path to hazm.
tagger_path = hf_hub_download(
    repo_id="gyroing/HAZM_POS_TAGGER",
    filename="pos_tagger.model",
)
tagger = hazm.POSTagger(model=tagger_path)
def preprocess_text(text: str) -> str:
    """Normalize text with hazm and return it as one space-joined string.

    The input is normalized, split into sentences, and each sentence is split
    into words. Every sentence's words are passed through ``fix_words`` and
    re-joined with single spaces; the sentences are then joined with single
    spaces as well.

    Args:
        text: Raw input text.

    Returns:
        The processed text as a single string. (The previous annotation
        advertised a list of word lists, but the function has always returned
        a string, which is what the synthesizer is given.)
    """
    normalized = normalizer.normalize(text)
    return " ".join(
        " ".join(fix_words(word_tokenizer.tokenize(sentence)))
        for sentence in sent_tokenizer.tokenize(normalized)
    )
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append a kasra to words whose POS tag ends in "Z".

    Each word is tagged with the module-level ``tagger``. When a tag ends in
    "Z" (presumably hazm's Ezafe tags such as ``NOUN,EZ`` -- confirm against
    the tagger model) and the word does not already end in a kasra, a kasra is
    appended. If such a word ends in "ه" that is not preceded by "ا", a "ی"
    is inserted before the kasra.

    Args:
        words: Tokens of a single sentence.

    Returns:
        A new list with the same number of tokens, adjusted as described.
    """
    if not words:
        # Nothing to tag; avoids calling the tagger with an empty sentence.
        return []

    fixed_words = []
    for word, pos in tagger.tag(words):
        # endswith() is used instead of [-1] indexing so that an empty tag or
        # an empty token cannot raise IndexError.
        if word and pos.endswith("Z") and not word.endswith("ِ"):
            # word[-2:-1] is "" for a one-character word, so a lone "ه" no
            # longer raises IndexError the way word[-2] did.
            if word.endswith("ه") and word[-2:-1] != "ا":
                word += "ی"
            word += "ِ"
        fixed_words.append(word)
    return fixed_words
@functools.lru_cache(maxsize=1)
def _load_voice():
    """Resolve the Piper model files and load the voice once per process."""
    repo_id = "gyroing/Persian-Piper-Model-gyro"
    # "meduim" (sic) is the file name this code has always requested.
    model_path = hf_hub_download(repo_id=repo_id, filename="fa_IR-gyro-meduim.onnx")
    config_path = hf_hub_download(repo_id=repo_id, filename="fa_IR-gyro-meduim.onnx.json")
    return PiperVoice.load(model_path, config_path)


def synthesize_speech(text):
    """Synthesize ``text`` to speech and return it as in-memory WAV bytes.

    The text is first run through ``preprocess_text`` and then rendered by the
    Piper voice into a mono, 16-bit WAV written to a ``BytesIO`` buffer.

    Args:
        text: The text entered by the user.

    Returns:
        A 2-tuple ``(wav_bytes, None)``: the complete WAV file contents
        (header included) for the audio output, and ``None`` for the hidden
        text output.
    """
    # The voice is cached so the model is not re-loaded on every request.
    voice = _load_voice()

    # Create an in-memory buffer for the WAV file
    buffer = BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono
        # Synthesis must happen while the WAV writer is still open.
        voice.synthesize(preprocess_text(text), wav_file)

    # The earlier np.frombuffer(...).tobytes() round trip returned these exact
    # bytes (WAV header included), so the buffer contents are returned as-is.
    # NOTE(review): relies on the Audio component accepting raw WAV bytes as
    # output -- confirm for the deployed Gradio version.
    return buffer.getvalue(), None
# Build the UI with Gradio Blocks: one text input, one audio output, and a
# hidden text output that receives the second value returned by
# synthesize_speech.
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text to synthesize it into speech using PiperVoice.")
    input_text = gr.Textbox(label="Input Text")
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy")
    # Hidden placeholder so the click handler's two return values both have a target.
    output_text = gr.Textbox(label="Output Text", visible=False)
    submit_button = gr.Button("Synthesize")
    # Event listeners have to be registered inside the Blocks context.
    submit_button.click(synthesize_speech, inputs=input_text, outputs=[output_audio, output_text])

# Run the app
blocks.launch()