# File size: 6,032 Bytes
# 7f5b6cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import torch
from transformers import MarianTokenizer, MarianMTModel
from pdf2docx import Converter
from docx import Document
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
from pydub import AudioSegment
import os
import nltk
from PyPDF2 import PdfReader
import textwrap

# Download the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Newer NLTK releases load this data from the "punkt_tab" resource instead of
# the older pickled "punkt" one, so fetch both to work with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Device configuration: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU. Models and their inputs are moved here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Translation function
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate ``source_text`` with a Helsinki-NLP OPUS-MT Marian model.

    The text is cut into chunks of at most 512 characters with
    ``textwrap.wrap`` (which also collapses runs of whitespace), the chunks
    are translated ``batch_size`` at a time, and the decoded outputs are
    joined with single spaces.

    Args:
        source_text: Text to translate.
        source_lang: Source language code used to build the checkpoint name.
        target_lang: Target language code used to build the checkpoint name.
        batch_size: Number of chunks passed to the model per ``generate`` call.

    Returns:
        The translated text, or an empty string when ``source_text`` produces
        no chunks (empty or whitespace-only input).
    """
    text_chunks = textwrap.wrap(source_text, 512)
    if not text_chunks:
        # Nothing to translate, so avoid loading the model at all.
        return ""

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

    # NOTE: the tokenizer and model are loaded on every call.
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    translated_chunks = []
    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Batches are padded, so hand the tokenizer's attention mask to
        # generate() explicitly along with the input ids.
        output_ids = model.generate(**encoded, max_new_tokens=512)
        translated_chunks.extend(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        )

    return " ".join(translated_chunks)

# Function to extract text from PDF
def pdf_to_text(pdf_path):
    """Extract the text of every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The text of all pages in page order, separated by newlines. Pages
        whose extraction yields no text contribute nothing.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        # ``or ""`` keeps a page that yields None from breaking the join.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Separate pages with a newline so the last word of one page does not
    # fuse with the first word of the next.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Load the Parler-TTS tokenizer and model once at import time; the model is
# moved to the configured device.
_TTS_CHECKPOINT = "parler-tts/parler-tts-large-v1"
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(_TTS_CHECKPOINT)
tts_model = tts_model.to(device)

# Function to split text into sentences
def split_text_into_sentences(text):
    """Return the sentences of ``text`` as produced by ``nltk.sent_tokenize``."""
    return nltk.sent_tokenize(text)

# Function to generate audio from text
def generate_wav_from_text(prompt, description, output_file_prefix):
    """Synthesize ``prompt`` as speech and write it to a WAV file.

    Args:
        prompt: The text to be spoken.
        description: Natural-language description of the desired voice.
        output_file_prefix: Output path without the ``.wav`` extension.

    Returns:
        The path of the written file, ``f"{output_file_prefix}.wav"``.
    """
    # Parler-TTS conditions on the voice description through ``input_ids`` and
    # reads the text to speak from ``prompt_input_ids``. Swapping them makes
    # the model read the description aloud instead of the sentence.
    description_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    generation = tts_model.generate(input_ids=description_ids, prompt_input_ids=prompt_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    output_file = f"{output_file_prefix}.wav"
    sf.write(output_file, audio_arr, tts_model.config.sampling_rate)
    return output_file

# Function to combine audio files
def combine_wav_files(output_file, *input_files, silence_duration=500):
    """Concatenate WAV files into ``output_file`` with a pause after each one.

    Args:
        output_file: Path the combined WAV is exported to.
        *input_files: Paths of the WAV files to append, in order.
        silence_duration: Value passed as ``duration`` to
            ``AudioSegment.silent`` for the pause appended after every
            input file (including the last one).
    """
    pause = AudioSegment.silent(duration=silence_duration)
    merged = AudioSegment.empty()

    for wav_path in input_files:
        clip = AudioSegment.from_wav(wav_path)
        merged += clip + pause

    merged.export(output_file, format='wav')

# Function to update target language options based on the source language
def update_target_lang_options(source_lang):
    """Build a Gradio update for the target-language dropdown.

    Args:
        source_lang: Currently selected source language code.

    Returns:
        A ``gr.update`` whose ``choices`` are the target languages available
        for ``source_lang`` and whose ``value`` is the first of them. For a
        source language without any targets, ``choices`` is empty and
        ``value`` is ``None``.
    """
    options = {
        "en": ["de", "fr", "tr"],
        "tr": ["en"],
        "de": ["en", "fr"],
        "fr": ["en", "de"]
    }
    targets = options.get(source_lang, [])
    # Indexing an empty list would raise IndexError for an unknown source
    # language; clear the selection instead.
    default_target = targets[0] if targets else None
    return gr.update(choices=targets, value=default_target)

# Main Gradio function
def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description):
    """Turn an uploaded PDF into per-sentence audio plus one combined track.

    The PDF text is extracted, optionally translated, split into sentences,
    and each sentence is synthesized to ``sentence_<n>.wav``. All clips are
    then merged into ``sentences_combined.wav``. Files are written to the
    current working directory under these fixed names, so a later run
    overwrites the output of an earlier one.

    Args:
        pdf_file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path.
        translate_checkbox: Whether to translate the text before synthesis.
        source_lang: Source language code passed to ``translate``.
        target_lang: Target language code passed to ``translate``.
        description: Voice description passed to the TTS step.

    Returns:
        A tuple ``(outputs, combined_output_file)`` where ``outputs`` is a
        list of ``(sentence, audio_file)`` pairs.

    Raises:
        gr.Error: If no file was uploaded or no sentence could be extracted.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")

    # Accept both a plain path and an upload object exposing ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = pdf_to_text(pdf_path)

    # Translate if translation checkbox is selected
    if translate_checkbox:
        text = translate(text, source_lang, target_lang)

    # Skip whitespace-only entries so no audio is generated for blank text.
    sentences = [s for s in split_text_into_sentences(text) if s.strip()]
    if not sentences:
        raise gr.Error("No text could be extracted from the PDF.")

    audio_files = []
    outputs = []
    for index, sentence in enumerate(sentences, start=1):
        audio_file = generate_wav_from_text(sentence, description, f"sentence_{index}")
        audio_files.append(audio_file)
        outputs.append((sentence, audio_file))

    combined_output_file = "sentences_combined.wav"
    combine_wav_files(combined_output_file, *audio_files)

    return outputs, combined_output_file

# Gradio interface: upload and settings in the left column, results on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            # File extensions given to ``file_types`` carry a leading dot.
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'])
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            # The language pickers start hidden to match the unchecked
            # checkbox; handle_translation_toggle shows them when it is ticked.
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en",
                                      interactive=True, visible=False)
            # Initial choices mirror what update_target_lang_options offers
            # for the default source language "en".
            target_lang = gr.Dropdown(choices=["de", "fr", "tr"], label="Target Language", value="tr",
                                      interactive=True, visible=False)
            description = gr.Textbox(label="Voice Description", 
                                     value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
            process_btn = gr.Button("Process")
        with gr.Column(scale=2):
            # NOTE(review): "audio" may not be a datatype gr.Dataframe supports;
            # confirm the second column renders as intended.
            output = gr.Dataframe(headers=["Sentence", "Audio"], label="Generated Audio", datatype=["str", "audio"])
            combined_audio = gr.Audio(label="Combined Audio with Silence", type="filepath")

    def handle_translation_toggle(translate_checkbox):
        """Show both language dropdowns only while translation is enabled."""
        show = bool(translate_checkbox)
        return gr.update(visible=show), gr.update(visible=show)

    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
    # process_pdf already has the signature the click handler needs.
    process_btn.click(process_pdf, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[output, combined_audio])

demo.launch()