import gradio as gr
import torch
from transformers import AutoTokenizer, MarianTokenizer, MarianMTModel
from parler_tts import ParlerTTSForConditionalGeneration
import soundfile as sf
from pydub import AudioSegment
import nltk
from PyPDF2 import PdfReader
import textwrap
# Download the punkt tokenizer for sentence splitting (newer NLTK releases
# also need the punkt_tab resource; the extra download fails harmlessly on
# older releases that do not ship it)
nltk.download('punkt')
nltk.download('punkt_tab')
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Translation function
def translate(source_text, source_lang, target_lang, batch_size=16):
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    # Split into ~512-character chunks so each piece fits the model's context
    text_chunks = textwrap.wrap(source_text, 512)
    translated_text = ""
    for i in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[i:i + batch_size]
        # Keep the attention mask alongside the input IDs so generate()
        # ignores the padding added for batching
        inputs = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        output_ids = model.generate(**inputs, max_new_tokens=512)
        for output in output_ids:
            output_text = tokenizer.decode(output, skip_special_tokens=True)
            translated_text += output_text + " "
    return translated_text.strip()
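# Hedged usage sketch (not called by the app itself): translating a short
# English sentence to German would look like
#   translate("The weather is nice today.", "en", "de")
# which downloads Helsinki-NLP/opus-mt-en-de on first use; a language pair
# only works if a matching opus-mt checkpoint exists on the Hugging Face Hub.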
# Function to extract text from PDF
def pdf_to_text(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no text layer
            text += page.extract_text() or ""
        return text
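# Hedged usage sketch: pdf_to_text("sample.pdf") returns the concatenated
# text layer of every page; scanned PDFs without a text layer come back
# empty, since PyPDF2 performs no OCR.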
# Load TTS model and tokenizer
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
# Function to split text into sentences
def split_text_into_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences
# Function to generate audio from text
def generate_wav_from_text(prompt, description, output_file_prefix):
    # Parler-TTS conditions on the voice description via input_ids and takes
    # the text to speak as prompt_input_ids (the original had these swapped)
    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    output_file = f"{output_file_prefix}.wav"
    sf.write(output_file, audio_arr, tts_model.config.sampling_rate)
    return output_file
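# Hedged usage sketch (file name is illustrative): the call below would write
# "intro.wav" with the sentence spoken in the described voice:
#   generate_wav_from_text("Hello and welcome.", "A calm, clear male voice.", "intro")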
# Function to combine audio files
def combine_wav_files(output_file, *input_files, silence_duration=500):
    combined = AudioSegment.empty()
    # Pause (in milliseconds) appended after every clip, including the last
    silence = AudioSegment.silent(duration=silence_duration)
    for file in input_files:
        audio = AudioSegment.from_wav(file)
        combined += audio + silence
    combined.export(output_file, format='wav')
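# Hedged usage sketch: stitch three clips with the default 500 ms pause
#   combine_wav_files("combined.wav", "a.wav", "b.wav", "c.wav")
# or widen the pause to a full second
#   combine_wav_files("combined.wav", "a.wav", "b.wav", silence_duration=1000)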
# Function to update target language options based on the source language
def update_target_lang_options(source_lang):
    options = {
        "en": ["de", "fr", "tr"],
        "tr": ["en"],
        "de": ["en", "fr"],
        "fr": ["en", "de"],
    }
    # Fall back to English so an unknown source language cannot raise IndexError
    choices = options.get(source_lang, ["en"])
    return gr.update(choices=choices, value=choices[0])
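# Hedged usage sketch: update_target_lang_options("de") returns a gr.update()
# that limits the target dropdown to ["en", "fr"] and preselects "en".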
# Main Gradio function
def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description):
    text = pdf_to_text(pdf_file.name)
    # Translate first if the translation checkbox is selected
    if translate_checkbox:
        text = translate(text, source_lang, target_lang)
    sentences = split_text_into_sentences(text)
    audio_files = []
    outputs = []
    for i, sentence in enumerate(sentences):
        output_file_prefix = f"sentence_{i + 1}"
        audio_file = generate_wav_from_text(sentence, description, output_file_prefix)
        audio_files.append(audio_file)
        outputs.append((sentence, audio_file))
    combined_output_file = "sentences_combined.wav"
    combine_wav_files(combined_output_file, *audio_files)
    return outputs, combined_output_file
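# Hedged walk-through: for a two-sentence PDF, this writes sentence_1.wav and
# sentence_2.wav, returns [(sentence, wav_path), ...] rows for the dataframe,
# and produces sentences_combined.wav for the combined audio player.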
# Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            # Hidden at startup so the UI matches the unchecked translation box
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True, visible=False)
            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True, visible=False)
            description = gr.Textbox(
                label="Voice Description",
                value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
            process_btn = gr.Button("Process")
        with gr.Column(scale=2):
            # gr.Dataframe has no "audio" datatype, so audio paths are shown as strings
            output = gr.Dataframe(headers=["Sentence", "Audio"], label="Generated Audio", datatype=["str", "str"])
            combined_audio = gr.Audio(label="Combined Audio with Silence", type="filepath")

    def handle_translation_toggle(translate_checkbox):
        # Show the language dropdowns only while translation is enabled
        visible = bool(translate_checkbox)
        return gr.update(visible=visible), gr.update(visible=visible)

    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
    # process_pdf already matches the click handler's signature, so no wrapper is needed
    process_btn.click(process_pdf, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[output, combined_audio])

demo.launch()