# Hugging Face Space: udayl/NotebookLM-Kokoro_TTS_App (status: Running)
# File size: 5,670 Bytes
# Revision: 3ac5c08

# filepath: /Users/udaylunawat/Downloads/Data-Science-Projects/NotebookLM_clone/gradio_app.py
import os
import tempfile
import gradio as gr
from notebook_lm_kokoro import generate_podcast_script, KPipeline
import soundfile as sf
import numpy as np
import ast
import shutil
import warnings
warnings.filterwarnings("ignore")

# A modified version of generate_audio_from_script that accepts a per-speaker voice mapping
def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
    """Synthesize a two-speaker transcript into one audio file.

    Args:
        script: String holding a Python literal list of ``(speaker, dialogue)``
            tuples, e.g. ``'[("Speaker 1", "Hi"), ("Speaker 2", "Hello")]'``.
        speaker1_voice: Voice name used for entries labelled ``"Speaker 1"``.
        speaker2_voice: Voice name used for entries labelled ``"Speaker 2"``.
        output_file: Path the combined audio is written to.

    Returns:
        ``output_file`` on success. ``None`` when the transcript is not a
        bracketed list literal, cannot be parsed, yields no audio, or any
        step raises (the error is printed, not propagated).
    """
    voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}

    # Reject non-string input up front instead of failing on ``.strip()``.
    if not isinstance(script, str):
        print("Invalid transcript format. Expected a list of tuples.")
        return None

    # Clean up the script string if needed
    script = script.strip()
    if not (script.startswith("[") and script.endswith("]")):
        print("Invalid transcript format. Expected a list of tuples.")
        return None

    try:
        # literal_eval only accepts Python literals, so the transcript text
        # is parsed without executing arbitrary code.
        transcript_list = ast.literal_eval(script)
        if not isinstance(transcript_list, list):
            raise ValueError("Transcript is not a list")

        # One pipeline is shared by every entry; it is created lazily so a
        # transcript with no valid entries never pays for model loading.
        pipeline = None
        all_audio_segments = []
        for i, entry in enumerate(transcript_list):
            if not isinstance(entry, tuple) or len(entry) != 2:
                print(f"Skipping invalid entry {i}: {entry}")
                continue

            speaker, dialogue = entry
            # Unknown speaker labels fall back to the default voice.
            chosen_voice = voice_map.get(speaker, "af_heart")
            print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")

            if pipeline is None:
                pipeline = KPipeline(lang_code="a")

            # The pipeline yields 3-tuples; only the audio chunk is needed.
            segment_audio = [
                audio for _, _, audio in pipeline(dialogue, voice=chosen_voice)
            ]
            if segment_audio:
                all_audio_segments.append(np.concatenate(segment_audio, axis=0))

        if not all_audio_segments:
            print("No audio segments were generated.")
            return None

        # Interleave ``sample_rate`` zero samples (one second of silence at
        # this rate) between consecutive segments, then join everything with
        # a single concatenate instead of re-copying the growing array.
        sample_rate = 24000
        pause = np.zeros(sample_rate, dtype=np.float32)
        pieces = [all_audio_segments[0]]
        for seg in all_audio_segments[1:]:
            pieces.append(pause)
            pieces.append(seg)
        final_audio = np.concatenate(pieces, axis=0)

        sf.write(output_file, final_audio, sample_rate)
        print(f"Saved final audio as {output_file}")
        return output_file

    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None.
        print(f"Error processing transcript: {e}")
        return None


def process_pdf(pdf_file, speaker1_voice, speaker2_voice, provider):
    """Turn an uploaded PDF into a generated audio file.

    Args:
        pdf_file: The upload handed over by the Gradio file input. Either a
            filesystem path string or an object exposing the path as
            ``.name``; ``None`` when nothing was uploaded.
        speaker1_voice: Voice name for "Speaker 1" lines.
        speaker2_voice: Voice name for "Speaker 2" lines.
        provider: Provider name forwarded to ``generate_podcast_script``.

    Returns:
        A ``(status_message, audio_path)`` tuple. ``audio_path`` is ``None``
        whenever a step fails; errors are reported in the status message
        rather than raised.
    """
    # Check if we received a valid file
    if pdf_file is None:
        return "No file uploaded", None

    tmp_path = None
    try:
        # Accept both a plain path string and an object carrying ``.name``.
        # NOTE(review): which one Gradio passes depends on its version and
        # the File component's ``type`` setting - confirm for this deployment.
        source_path = getattr(pdf_file, "name", pdf_file)

        # Reserve a unique .pdf path, then copy once the handle is closed so
        # the copy never writes to a file that is still open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp_path = tmp.name
        shutil.copy2(source_path, tmp_path)

        print(f"Uploaded PDF saved at {tmp_path}")

        # Generate transcript using the existing script generator
        transcript, _transcript_path = generate_podcast_script(tmp_path, provider=provider)
        if transcript is None:
            return "Error generating transcript", None

        # Place the audio next to the temp PDF as "audio_<stem>.wav".
        stem = os.path.splitext(os.path.basename(tmp_path))[0]
        audio_output_path = os.path.join(os.path.dirname(tmp_path), f"audio_{stem}.wav")

        result = generate_audio_from_script_with_voices(
            transcript,
            speaker1_voice,
            speaker2_voice,
            output_file=audio_output_path,
        )

        if result is None:
            return "Error generating audio", None

        return "Process complete!", result

    except Exception as e:
        # UI boundary: surface the failure in the status box instead of crashing.
        print(f"Error in process_pdf: {str(e)}")
        return f"Error processing file: {str(e)}", None

    finally:
        # The temp PDF was created with delete=False, so remove it ourselves.
        # The generated audio file is left in place for the caller.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass


def create_gradio_app():
    """Assemble the Gradio Blocks interface and return it without launching.

    The layout is: a PDF upload row, a column with two voice dropdowns, a
    provider radio and the submit button, then a row with the status textbox
    and the audio player. Clicking the button calls ``process_pdf``.
    """
    first_voice_options = ["af_heart", "af_bella", "hf_beta"]
    second_voice_options = ["af_nicole", "af_heart", "bf_emma"]
    provider_options = ["openai", "openrouter"]

    with gr.Blocks() as ui:
        gr.Markdown("# NotebookLM-Kokoro TTS App")
        gr.Markdown("Upload a PDF, choose voices, and generate TTS audio using Kokoro.")

        # Upload row
        with gr.Row():
            upload_box = gr.File(label="Upload PDF", file_types=[".pdf"])

        # Settings column
        with gr.Row():
            with gr.Column():
                voice_one_picker = gr.Dropdown(
                    choices=first_voice_options,
                    value="af_heart",
                    label="Speaker 1 Voice",
                )
                voice_two_picker = gr.Dropdown(
                    choices=second_voice_options,
                    value="af_nicole",
                    label="Speaker 2 Voice",
                )
                provider_picker = gr.Radio(
                    choices=provider_options,
                    value="openrouter",
                    label="API Provider (TTS Script Generation)",
                )
                generate_button = gr.Button("Generate Audio")

        # Results row
        with gr.Row():
            status_box = gr.Textbox(label="Status")
            audio_player = gr.Audio(label="Generated Audio", type="filepath")

        # process_pdf returns (status, audio path), matching the output order.
        click_inputs = [upload_box, voice_one_picker, voice_two_picker, provider_picker]
        click_outputs = [status_box, audio_player]
        generate_button.click(fn=process_pdf, inputs=click_inputs, outputs=click_outputs)

    return ui


# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    demo = create_gradio_app()
    demo.launch(share=True)  # share=True requests a public share URL in addition to the local one