Update app.py
app.py
CHANGED
Old version (removed lines marked "-", unchanged context unmarked):

@@ -1,47 +1,24 @@
-import spaces
-
 import gradio as gr
 import torch
 from transformers import MarianTokenizer, MarianMTModel
 from parler_tts import ParlerTTSForConditionalGeneration
-from transformers import AutoTokenizer
-import soundfile as sf
-from pydub import AudioSegment
-import os
-import re
 from PyPDF2 import PdfReader
 import textwrap
 
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Initialize models and tokenizers
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
 
-#
-@spaces.GPU(duration=120)
-def translate(source_text, source_lang, target_lang, batch_size=16):
-    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
-
-    tokenizer = MarianTokenizer.from_pretrained(model_name)
-    model = MarianMTModel.from_pretrained(model_name).to(device)
-
-    text_chunks = textwrap.wrap(source_text, 512)
-    translated_text = ""
-
-    for i in range(0, len(text_chunks), batch_size):
-        text_batch = text_chunks[i:i+batch_size]
-        input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
-        output_ids = model.generate(input_ids, max_new_tokens=512)
-
-        for output in output_ids:
-            output_text = tokenizer.decode(output, skip_special_tokens=True)
-            translated_text += output_text + " "
-
-    return translated_text
-
-# Function to extract text from PDF
 def pdf_to_text(pdf_path):
     with open(pdf_path, 'rb') as file:
         pdf_reader = PdfReader(file)
@@ -51,101 +28,64 @@ def pdf_to_text(pdf_path):
         text += page.extract_text()
     return text
 
-#
 def split_text_into_sentences(text):
     sentence_endings = re.compile(r'[.!?]')
     sentences = sentence_endings.split(text)
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 
 # Function to generate audio for a single sentence
 @spaces.GPU(duration=120)
-def generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer):
-    …
     audio_arr = generation.cpu().numpy().squeeze()
-    output_file = f"…
-    sf.write(output_file, audio_arr, …
-    return …
-
-# Function to combine audio files
-def combine_wav_files(output_file, *input_files, silence_duration=500):
-    combined = AudioSegment.empty()
-    one_second_silence = AudioSegment.silent(duration=silence_duration)
-
-    for file in input_files:
-        audio = AudioSegment.from_wav(file)
-        combined += audio + one_second_silence
-
-    combined.export(output_file, format='wav')
-
-# Function to update target language options based on the source language
-def update_target_lang_options(source_lang):
-    options = {
-        "en": ["de", "fr", "tr"],
-        "tr": ["en"],
-        "de": ["en", "fr"],
-        "fr": ["en", "de"]
-    }
-    return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
-
-# Function to process sentences for audio generation
-def process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
-    audio_files = []
 
-    for i, sentence in enumerate(sentences):
-        print(f"Generating audio for sentence {i+1}...")
-        output_file_prefix = f"sentence_{i+1}"
-        audio_file = generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer)
-        audio_files.append(audio_file)
-
-        yield sentence, audio_file
-
-    combined_output_file = "sentences_combined.wav"
-    combine_wav_files(combined_output_file, *audio_files)
-
-    yield None, combined_output_file
-
-# Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
-        with gr.Column():
             pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
-
-            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
-            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
-            description = gr.Textbox(label="Voice Description",
                 value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
-
-        with gr.Column():
-            gr.…
-            output_group = gr.Group()
 
-    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
         text = pdf_to_text(pdf_input.name)
-        if translate_checkbox:
-            text = translate(text, source_lang, target_lang)
-
        sentences = split_text_into_sentences(text)
-        for sentence, audio_file in process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
-            if sentence:
-                with output_group:
-                    gr.Markdown(f"**Sentence**: {sentence}")
-                    gr.Audio(value=audio_file, label=sentence)
-            else:
-                with output_group:
-                    gr.Markdown("### Combined Audio")
-                    gr.Audio(value=audio_file, label="Combined Audio")
 
-    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
-    process_btn.click(handle_process, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[])
 
 demo.launch()
New version (added lines marked "+", unchanged context unmarked):
+import spaces
 import gradio as gr
 import torch
 from transformers import MarianTokenizer, MarianMTModel
 from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer, set_seed
 from PyPDF2 import PdfReader
+import re
 import textwrap
+import soundfile as sf
 
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# Initialize models and tokenizers
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
+SAMPLE_RATE = 22050  # Adjust as needed
+SEED = 42
 
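A note on the new SAMPLE_RATE constant: Parler-TTS checkpoints carry their native output rate in the model config, so hardcoding 22050 risks writing the audio at the wrong speed and pitch. A minimal sketch, assuming the loaded model exposes sampling_rate the way the Parler-TTS examples do:

# Derive the output rate from the checkpoint instead of hardcoding it
SAMPLE_RATE = tts_model.config.sampling_rate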
+# Helper function to extract text from a PDF
 def pdf_to_text(pdf_path):
     with open(pdf_path, 'rb') as file:
         pdf_reader = PdfReader(file)
         …
         text += page.extract_text()
     return text
 
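Incidentally, PyPDF2 has since been merged back into the maintained pypdf package; if the Space's requirements can change, the import below is the usual drop-in replacement (an aside, not part of this commit):

from pypdf import PdfReader  # same PdfReader API as PyPDF2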
+# Helper function to split text into sentences using regex
 def split_text_into_sentences(text):
     sentence_endings = re.compile(r'[.!?]')
     sentences = sentence_endings.split(text)
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 
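Since re.split discards the matched delimiters, every returned sentence loses its terminal punctuation, and abbreviations are split too; a quick check of the behavior, for reference:

>>> split_text_into_sentences("Dr. Smith arrived. It was late!")
['Dr', 'Smith arrived', 'It was late']

This is also why preprocess() below always ends up re-appending a period before synthesis.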
+# Helper function to preprocess the text (normalization, punctuation)
+def preprocess(text):
+    text = text.replace("-", " ")
+    if text[-1] not in ".!?":
+        text += "."
+    return text
+
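For reference, the preprocessing replaces hyphens with spaces and guarantees terminal punctuation before the text reaches the TTS model:

>>> preprocess("a well-known fact")
'a well known fact.'

Note that text[-1] assumes a non-empty string, which holds here because split_text_into_sentences() filters out empty sentences.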
 # Function to generate audio for a single sentence
 @spaces.GPU(duration=120)
+def generate_single_wav_from_text(sentence, description):
+    set_seed(SEED)
+    inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
+    prompt = tts_tokenizer(preprocess(sentence), return_tensors="pt").to(device)
+
+    generation = tts_model.generate(
+        input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
+        prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
+    )
     audio_arr = generation.cpu().numpy().squeeze()
+    output_file = "sentence.wav"
+    sf.write(output_file, audio_arr, SAMPLE_RATE)
+    return SAMPLE_RATE, audio_arr
 
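For a quick sanity check outside the UI, the function can be driven directly; note that every call overwrites the same sentence.wav side-effect file, so the returned array is what callers should keep:

sr, audio = generate_single_wav_from_text(
    "The quick brown fox jumps over the lazy dog.",
    "Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.",
)
sf.write("check.wav", audio, sr)  # illustrative output file name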
+# Gradio Interface
 with gr.Blocks() as demo:
     with gr.Row():
+        with gr.Column():
             pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
+            description = gr.Textbox(label="Voice Description", lines=2,
                 value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
+            run_button = gr.Button("Generate Audio", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Audio")
 
+    def handle_process(pdf_input, description):
+        # Extract and process text from PDF
         text = pdf_to_text(pdf_input.name)
         sentences = split_text_into_sentences(text)
 
+        for sentence in sentences:
+            # Generate audio for each sentence
+            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
+            yield sentence, sample_rate, audio_arr
+
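Because handle_process is a generator that yields after every sentence, the interface can stream results incrementally; this is what the demo.queue() call added at the bottom of the file enables.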
+    def run_pipeline(pdf_input, description):
+        # Stream outputs to the Gradio interface: yielding the
+        # (sample_rate, audio_array) tuple updates audio_output per sentence
+        for sentence, sample_rate, audio_arr in handle_process(pdf_input, description):
+            yield (sample_rate, audio_arr)
 
+    run_button.click(run_pipeline, inputs=[pdf_input, description], outputs=[audio_output])
 
+demo.queue()
 demo.launch()
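If the sentence text should appear next to each clip, the usual Gradio pattern is a second output component with one yielded value per declared output; a minimal sketch under that assumption (sentence_box is a hypothetical extra component, not part of this commit):

sentence_box = gr.Markdown()  # hypothetical, would live in the right-hand column

def run_pipeline(pdf_input, description):
    for sentence, sample_rate, audio_arr in handle_process(pdf_input, description):
        # one value per declared output, in order
        yield f"**Sentence**: {sentence}", (sample_rate, audio_arr)

run_button.click(run_pipeline, inputs=[pdf_input, description],
                 outputs=[sentence_box, audio_output])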