Update app.py
app.py
CHANGED
Old version (removed lines marked "-", unchanged context unmarked):

@@ -1,47 +1,24 @@
-import spaces
-
 import gradio as gr
 import torch
 from transformers import MarianTokenizer, MarianMTModel
 from parler_tts import ParlerTTSForConditionalGeneration
-from transformers import AutoTokenizer
-import soundfile as sf
-from pydub import AudioSegment
-import os
-import re
 from PyPDF2 import PdfReader
 import textwrap
 
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Initialize models and tokenizers
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
 
-#
-@spaces.GPU(duration=120)
-def translate(source_text, source_lang, target_lang, batch_size=16):
-    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
-
-    tokenizer = MarianTokenizer.from_pretrained(model_name)
-    model = MarianMTModel.from_pretrained(model_name).to(device)
-
-    text_chunks = textwrap.wrap(source_text, 512)
-    translated_text = ""
-
-    for i in range(0, len(text_chunks), batch_size):
-        text_batch = text_chunks[i:i+batch_size]
-        input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
-        output_ids = model.generate(input_ids, max_new_tokens=512)
-
-        for output in output_ids:
-            output_text = tokenizer.decode(output, skip_special_tokens=True)
-            translated_text += output_text + " "
-
-    return translated_text
-
-# Function to extract text from PDF
 def pdf_to_text(pdf_path):
     with open(pdf_path, 'rb') as file:
         pdf_reader = PdfReader(file)
@@ -51,101 +28,64 @@ def pdf_to_text(pdf_path):
         text += page.extract_text()
     return text
 
-#
 def split_text_into_sentences(text):
     sentence_endings = re.compile(r'[.!?]')
     sentences = sentence_endings.split(text)
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 
 # Function to generate audio for a single sentence
 @spaces.GPU(duration=120)
-def generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer):
-    …
     audio_arr = generation.cpu().numpy().squeeze()
-    output_file = f"…
-    sf.write(output_file, audio_arr, …
-    return …
-
-# Function to combine audio files
-def combine_wav_files(output_file, *input_files, silence_duration=500):
-    combined = AudioSegment.empty()
-    one_second_silence = AudioSegment.silent(duration=silence_duration)
-
-    for file in input_files:
-        audio = AudioSegment.from_wav(file)
-        combined += audio + one_second_silence
-
-    combined.export(output_file, format='wav')
-
-# Function to update target language options based on the source language
-def update_target_lang_options(source_lang):
-    options = {
-        "en": ["de", "fr", "tr"],
-        "tr": ["en"],
-        "de": ["en", "fr"],
-        "fr": ["en", "de"]
-    }
-    return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
-
-# Function to process sentences for audio generation
-def process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
-    audio_files = []
 
-    for i, sentence in enumerate(sentences):
-        print(f"Generating audio for sentence {i+1}...")
-        output_file_prefix = f"sentence_{i+1}"
-        audio_file = generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer)
-        audio_files.append(audio_file)
-
-        yield sentence, audio_file
-
-    combined_output_file = "sentences_combined.wav"
-    combine_wav_files(combined_output_file, *audio_files)
-
-    yield None, combined_output_file
-
-# Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
-        with gr.Column():
             pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
-
-            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
-            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
-            description = gr.Textbox(label="Voice Description",
                 value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
-
-        with gr.Column():
-            gr.…
-            output_group = gr.Group()
 
-    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
         text = pdf_to_text(pdf_input.name)
-        if translate_checkbox:
-            text = translate(text, source_lang, target_lang)
-
        sentences = split_text_into_sentences(text)
-        for sentence, audio_file in process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
-            if sentence:
-                with output_group:
-                    gr.Markdown(f"**Sentence**: {sentence}")
-                    gr.Audio(value=audio_file, label=sentence)
-            else:
-                with output_group:
-                    gr.Markdown("### Combined Audio")
-                    gr.Audio(value=audio_file, label="Combined Audio")
 
-    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
-    process_btn.click(handle_process, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[])
 
 demo.launch()
New version (added lines marked "+", unchanged context unmarked):
+import spaces
 import gradio as gr
 import torch
 from transformers import MarianTokenizer, MarianMTModel
 from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer, set_seed
 from PyPDF2 import PdfReader
+import re
 import textwrap
+import soundfile as sf
 
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# Initialize models and tokenizers
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
+SAMPLE_RATE = 22050  # Adjust as needed
+SEED = 42
 
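A note on the new SAMPLE_RATE constant: Parler-TTS checkpoints carry their native output rate in the model config, so hardcoding 22050 risks writing the audio at the wrong speed and pitch. A minimal sketch, assuming the loaded model exposes sampling_rate the way the Parler-TTS examples do:

# Derive the output rate from the checkpoint instead of hardcoding it
SAMPLE_RATE = tts_model.config.sampling_rate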
+# Helper function to extract text from a PDF
 def pdf_to_text(pdf_path):
     with open(pdf_path, 'rb') as file:
         pdf_reader = PdfReader(file)
         …
         text += page.extract_text()
     return text
 
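Incidentally, PyPDF2 has since been merged back into the maintained pypdf package; if the Space's requirements can change, the import below is the usual drop-in replacement (an aside, not part of this commit):

from pypdf import PdfReader  # same PdfReader API as PyPDF2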
+# Helper function to split text into sentences using regex
 def split_text_into_sentences(text):
     sentence_endings = re.compile(r'[.!?]')
     sentences = sentence_endings.split(text)
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 
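Since re.split discards the matched delimiters, every returned sentence loses its terminal punctuation, and abbreviations are split too; a quick check of the behavior, for reference:

>>> split_text_into_sentences("Dr. Smith arrived. It was late!")
['Dr', 'Smith arrived', 'It was late']

This is also why preprocess() below always ends up re-appending a period before synthesis.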
+# Helper function to preprocess the text (normalization, punctuation)
+def preprocess(text):
+    text = text.replace("-", " ")
+    if text[-1] not in ".!?":
+        text += "."
+    return text
+
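For reference, the preprocessing replaces hyphens with spaces and guarantees terminal punctuation before the text reaches the TTS model:

>>> preprocess("a well-known fact")
'a well known fact.'

Note that text[-1] assumes a non-empty string, which holds here because split_text_into_sentences() filters out empty sentences.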
 # Function to generate audio for a single sentence
 @spaces.GPU(duration=120)
+def generate_single_wav_from_text(sentence, description):
+    set_seed(SEED)
+    inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
+    prompt = tts_tokenizer(preprocess(sentence), return_tensors="pt").to(device)
+
+    generation = tts_model.generate(
+        input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
+        prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
+    )
     audio_arr = generation.cpu().numpy().squeeze()
+    output_file = "sentence.wav"
+    sf.write(output_file, audio_arr, SAMPLE_RATE)
+    return SAMPLE_RATE, audio_arr
 
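For a quick sanity check outside the UI, the function can be driven directly; note that every call overwrites the same sentence.wav side-effect file, so the returned array is what callers should keep:

sr, audio = generate_single_wav_from_text(
    "The quick brown fox jumps over the lazy dog.",
    "Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.",
)
sf.write("check.wav", audio, sr)  # illustrative output file name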
+# Gradio Interface
 with gr.Blocks() as demo:
     with gr.Row():
+        with gr.Column():
             pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
+            description = gr.Textbox(label="Voice Description", lines=2,
                 value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
+            run_button = gr.Button("Generate Audio", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Audio")
 
+    def handle_process(pdf_input, description):
+        # Extract and process text from PDF
         text = pdf_to_text(pdf_input.name)
         sentences = split_text_into_sentences(text)
 
+        for sentence in sentences:
+            # Generate audio for each sentence
+            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
+            yield sentence, sample_rate, audio_arr
+
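Because handle_process is a generator that yields after every sentence, the interface can stream results incrementally; this is what the demo.queue() call added at the bottom of the file enables.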
+    def run_pipeline(pdf_input, description):
+        # Stream outputs to the Gradio interface: yielding the
+        # (sample_rate, audio_array) tuple updates audio_output per sentence
+        for sentence, sample_rate, audio_arr in handle_process(pdf_input, description):
+            yield (sample_rate, audio_arr)
 
+    run_button.click(run_pipeline, inputs=[pdf_input, description], outputs=[audio_output])
 
+demo.queue()
 demo.launch()
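If the sentence text should appear next to each clip, the usual Gradio pattern is a second output component with one yielded value per declared output; a minimal sketch under that assumption (sentence_box is a hypothetical extra component, not part of this commit):

sentence_box = gr.Markdown()  # hypothetical, would live in the right-hand column

def run_pipeline(pdf_input, description):
    for sentence, sample_rate, audio_arr in handle_process(pdf_input, description):
        # one value per declared output, in order
        yield f"**Sentence**: {sentence}", (sample_rate, audio_arr)

run_button.click(run_pipeline, inputs=[pdf_input, description],
                 outputs=[sentence_box, audio_output])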