|
import gradio as gr |
|
import torch |
|
from transformers import MarianTokenizer, MarianMTModel |
|
from pdf2docx import Converter |
|
from docx import Document |
|
from parler_tts import ParlerTTSForConditionalGeneration |
|
from transformers import AutoTokenizer |
|
import soundfile as sf |
|
from pydub import AudioSegment |
|
import os |
|
import nltk |
|
from PyPDF2 import PdfReader |
|
import textwrap |
|
|
|
|
|
# Download the Punkt sentence-boundary models required by nltk.sent_tokenize
# (used in split_text_into_sentences below).
nltk.download('punkt')

# Run all models on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate *source_text* with a Helsinki-NLP MarianMT model.

    Parameters
    ----------
    source_text : str
        Text to translate; it is split into <=512-character chunks first.
    source_lang, target_lang : str
        Language codes selecting the ``opus-mt-{src}-{tgt}`` checkpoint.
        The pair must correspond to an existing Helsinki-NLP model.
    batch_size : int, optional
        Number of chunks translated per forward pass.

    Returns
    -------
    str
        The translated chunks joined by single spaces ("" for empty input).
    """
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    # 512 characters per chunk keeps each piece safely under the model's
    # 512-token input limit after tokenization.
    text_chunks = textwrap.wrap(source_text, 512)
    translated_parts = []

    for i in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[i:i + batch_size]
        input_ids = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).input_ids.to(device)

        # Inference only: no_grad avoids accumulating autograd state.
        with torch.no_grad():
            output_ids = model.generate(input_ids, max_new_tokens=512)

        translated_parts.extend(
            tokenizer.decode(output, skip_special_tokens=True)
            for output in output_ids
        )

    # join() instead of repeated `+=`: linear time and no stray trailing space.
    return " ".join(translated_parts)
|
|
|
|
|
def pdf_to_text(pdf_path):
    """Extract and concatenate the text of every page of the PDF at *pdf_path*.

    Pages with no extractable text contribute nothing: PyPDF2's
    ``extract_text`` may return ``None``/"" for image-only pages, and the
    original ``text += page.extract_text()`` would raise TypeError on None.

    Returns
    -------
    str
        All page texts concatenated in page order.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        # Iterate pages directly instead of indexing via range(len(...));
        # join() replaces quadratic `+=` accumulation.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
|
|
|
|
# Parler-TTS model and tokenizer, loaded once at module import and shared by
# generate_wav_from_text below.
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
|
|
|
|
|
def split_text_into_sentences(text):
    """Return the sentences of *text* as detected by NLTK's Punkt tokenizer."""
    return nltk.sent_tokenize(text)
|
|
|
|
|
def generate_wav_from_text(prompt, description, output_file_prefix):
    """Synthesize *prompt* as speech with Parler-TTS and write it to a WAV file.

    Parameters
    ----------
    prompt : str
        The text to be spoken aloud.
    description : str
        Natural-language description of the desired voice / recording style.
    output_file_prefix : str
        Output path prefix; ".wav" is appended.

    Returns
    -------
    str
        Path of the written WAV file.
    """
    # Parler-TTS convention (see the parler-tts README): the voice
    # *description* is tokenized as `input_ids`, while the text to speak is
    # passed as `prompt_input_ids`. The original code had the two swapped,
    # which made the model narrate the voice description.
    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    output_file = f"{output_file_prefix}.wav"
    sf.write(output_file, audio_arr, tts_model.config.sampling_rate)
    return output_file
|
|
|
|
|
def combine_wav_files(output_file, *input_files, silence_duration=500):
    """Concatenate WAV files into *output_file*, appending *silence_duration*
    milliseconds of silence after each clip.

    Parameters
    ----------
    output_file : str
        Destination path for the combined WAV.
    *input_files : str
        Paths of the WAV files to concatenate, in order.
    silence_duration : int, optional
        Gap inserted after every clip, in milliseconds (default 500).
    """
    gap = AudioSegment.silent(duration=silence_duration)
    merged = AudioSegment.empty()

    for path in input_files:
        merged = merged + AudioSegment.from_wav(path) + gap

    merged.export(output_file, format='wav')
|
|
|
|
|
def update_target_lang_options(source_lang):
    """Return a Gradio update restricting the target-language dropdown to the
    languages reachable from *source_lang*, preselecting the first one.

    An unknown source language now yields an empty choice list with no
    selection instead of raising IndexError (the original indexed ``[0]``
    unconditionally on a possibly-empty default).
    """
    # Supported translation directions (Helsinki-NLP opus-mt pairs used here).
    options = {
        "en": ["de", "fr", "tr"],
        "tr": ["en"],
        "de": ["en", "fr"],
        "fr": ["en", "de"],
    }
    choices = options.get(source_lang, [])
    return gr.update(choices=choices, value=choices[0] if choices else None)
|
|
|
|
|
def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description):
    """Run the full pipeline: PDF -> text -> optional translation ->
    per-sentence TTS -> combined audio file.

    Parameters
    ----------
    pdf_file : file-like
        Uploaded PDF (a Gradio file object exposing ``.name``).
    translate_checkbox : bool
        When True, translate the extracted text before synthesis.
    source_lang, target_lang : str
        Language codes forwarded to :func:`translate`.
    description : str
        Voice description forwarded to :func:`generate_wav_from_text`.

    Returns
    -------
    tuple[list[tuple[str, str]], str]
        (sentence, wav_path) pairs and the path of the combined WAV.
    """
    text = pdf_to_text(pdf_file.name)

    if translate_checkbox:
        text = translate(text, source_lang, target_lang)

    results = []
    wav_paths = []

    # One WAV per sentence, numbered from 1 ("sentence_1.wav", ...).
    for index, sentence in enumerate(split_text_into_sentences(text), start=1):
        wav_path = generate_wav_from_text(sentence, description, f"sentence_{index}")
        wav_paths.append(wav_path)
        results.append((sentence, wav_path))

    combined_output_file = "sentences_combined.wav"
    combine_wav_files(combined_output_file, *wav_paths)

    return results, combined_output_file
|
|
|
|
|
# Gradio UI: left column holds the inputs, right column shows the results.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            # NOTE(review): both dropdowns start visible even though translation
            # is disabled by default; they are only hidden after the checkbox
            # is toggled once — confirm whether they should start hidden.
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(label="Voice Description",
                                     value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
            process_btn = gr.Button("Process")
        with gr.Column(scale=2):
            output = gr.Dataframe(headers=["Sentence", "Audio"], label="Generated Audio", datatype=["str", "audio"])
            combined_audio = gr.Audio(label="Combined Audio with Silence", type="filepath")

    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
        """Thin wrapper forwarding the UI inputs to process_pdf."""
        return process_pdf(pdf_input, translate_checkbox, source_lang, target_lang, description)

    def handle_translation_toggle(translate_checkbox):
        """Show the language dropdowns when translation is enabled, hide otherwise."""
        if translate_checkbox:
            return gr.update(visible=True), gr.update(visible=True)
        else:
            return gr.update(visible=False), gr.update(visible=False)

    # Wiring: checkbox toggles dropdown visibility; changing the source
    # language refreshes the valid target languages; the button runs the
    # whole pipeline and fills both outputs.
    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
    process_btn.click(handle_process, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[output, combined_audio])

demo.launch()
|
|