import gradio as gr
import torch
from transformers import AutoTokenizer, MarianTokenizer, MarianMTModel
from parler_tts import ParlerTTSForConditionalGeneration
import soundfile as sf
from pydub import AudioSegment
import nltk
from PyPDF2 import PdfReader
import textwrap

# Download the punkt tokenizer for sentence splitting
nltk.download('punkt')
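# NLTK 3.9+ loads the sentence tokenizer from 'punkt_tab' instead; downloading
# both keeps sent_tokenize working across versions (the version cutoff is an assumption)
nltk.download('punkt_tab')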
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Translation function (loads the OPUS-MT model for the requested language pair on each call)
def translate(source_text, source_lang, target_lang, batch_size=16):
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    # Chunk by characters; the tokenizer truncates anything still over 512 tokens
    text_chunks = textwrap.wrap(source_text, 512)
    translated_parts = []
    for i in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[i:i + batch_size]
        input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
        output_ids = model.generate(input_ids, max_new_tokens=512)
        for output in output_ids:
            translated_parts.append(tokenizer.decode(output, skip_special_tokens=True))
    return " ".join(translated_parts)
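
# Quick sanity check for translate() (assumes the Helsinki-NLP en-de checkpoint
# downloads cleanly; any pair listed in update_target_lang_options works the same way):
#   print(translate("Hello world. How are you?", "en", "de"))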

# Function to extract text from a PDF
def pdf_to_text(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None on image-only pages
            text += page.extract_text() or ""
    return text

# Load TTS model and tokenizer
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")

# Function to split text into sentences
def split_text_into_sentences(text):
    return nltk.sent_tokenize(text)

# Function to generate audio from text with Parler-TTS
def generate_wav_from_text(prompt, description, output_file_prefix):
    # Parler-TTS expects the voice description as input_ids and the
    # text to be spoken as prompt_input_ids
    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    output_file = f"{output_file_prefix}.wav"
    sf.write(output_file, audio_arr, tts_model.config.sampling_rate)
    return output_file
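
# Example call: the text to speak comes first, the voice description second
# (the file prefix "demo" is just an illustration):
#   wav_path = generate_wav_from_text("Hello there.", "A calm, clear voice.", "demo")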

# Function to combine audio files with a pause between clips
def combine_wav_files(output_file, *input_files, silence_duration=500):
    combined = AudioSegment.empty()
    silence = AudioSegment.silent(duration=silence_duration)  # duration in milliseconds
    for file in input_files:
        combined += AudioSegment.from_wav(file) + silence
    combined.export(output_file, format='wav')
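
# Example: stitch two generated clips with the default 500 ms pause between them:
#   combine_wav_files("combined.wav", "sentence_1.wav", "sentence_2.wav")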

# Function to update the target language options based on the source language
def update_target_lang_options(source_lang):
    options = {
        "en": ["de", "fr", "tr"],
        "tr": ["en"],
        "de": ["en", "fr"],
        "fr": ["en", "de"]
    }
    choices = options.get(source_lang, [])
    # Fall back to None so an unknown source language cannot raise an IndexError
    return gr.update(choices=choices, value=choices[0] if choices else None)

# Main Gradio function
def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description):
    text = pdf_to_text(pdf_file.name)
    # Translate first if the translation checkbox is selected
    if translate_checkbox:
        text = translate(text, source_lang, target_lang)
    sentences = split_text_into_sentences(text)
    audio_files = []
    outputs = []
    for i, sentence in enumerate(sentences):
        output_file_prefix = f"sentence_{i + 1}"
        audio_file = generate_wav_from_text(sentence, description, output_file_prefix)
        audio_files.append(audio_file)
        outputs.append((sentence, audio_file))
    combined_output_file = "sentences_combined.wav"
    combine_wav_files(combined_output_file, *audio_files)
    return outputs, combined_output_file

# Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'])
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            # Start the language dropdowns hidden so they match the unchecked translation box
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True, visible=False)
            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True, visible=False)
            description = gr.Textbox(label="Voice Description",
                                     value="An old man's voice with a monotone delivery, recorded very close up with almost no background noise.")
            process_btn = gr.Button("Process")
        with gr.Column(scale=2):
            # The audio column holds file paths; gr.Dataframe has no native audio datatype
            output = gr.Dataframe(headers=["Sentence", "Audio"], label="Generated Audio", datatype=["str", "str"])
            combined_audio = gr.Audio(label="Combined Audio with Silence", type="filepath")

    # Show or hide the language dropdowns when the translation checkbox changes
    def handle_translation_toggle(translate_checkbox):
        update = gr.update(visible=translate_checkbox)
        return update, update

    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
    process_btn.click(process_pdf, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[output, combined_audio])

demo.launch()