# filepath: /Users/udaylunawat/Downloads/Data-Science-Projects/NotebookLM_clone/gradio_app.py import os import tempfile import gradio as gr from notebook_lm_kokoro import generate_podcast_script, KPipeline import soundfile as sf import numpy as np import ast import shutil import warnings import os import gradio as gr from notebook_lm_kokoro import generate_podcast_script, generate_audio_from_script warnings.filterwarnings("ignore") # A modified version of generate_audio_from_script to accept voice mapping def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file): voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice} # Clean up the script string if needed script = script.strip() if not script.startswith("[") or not script.endswith("]"): print("Invalid transcript format. Expected a list of tuples.") return None try: transcript_list = ast.literal_eval(script) if not isinstance(transcript_list, list): raise ValueError("Transcript is not a list") all_audio_segments = [] # Process each dialogue entry for i, entry in enumerate(transcript_list): if not isinstance(entry, tuple) or len(entry) != 2: print(f"Skipping invalid entry {i}: {entry}") continue speaker, dialogue = entry chosen_voice = voice_map.get(speaker, "af_heart") print(f"Generating audio for {speaker} with voice '{chosen_voice}'...") pipeline = KPipeline(lang_code="a") generator = pipeline(dialogue, voice=chosen_voice) segment_audio = [] for j, (gs, ps, audio) in enumerate(generator): # print(f"{speaker} - Segment {j}: Global Step = {gs}, Partial Step = {ps}") segment_audio.append(audio) if segment_audio: segment_full = np.concatenate(segment_audio, axis=0) all_audio_segments.append(segment_full) if not all_audio_segments: print("No audio segments were generated.") return None # Add a pause between segments sample_rate = 24000 pause = np.zeros(sample_rate, dtype=np.float32) final_audio = all_audio_segments[0] for seg in all_audio_segments[1:]: final_audio = np.concatenate((final_audio, pause, seg), axis=0) sf.write(output_file, final_audio, sample_rate) print(f"Saved final audio as {output_file}") return output_file except Exception as e: print(f"Error processing transcript: {e}") return None def process_pdf(pdf_file, speaker1_voice, speaker2_voice, provider, api_key, openrouter_base=None): """Process the uploaded PDF file and generate audio""" try: # Set API configuration based on provider if provider == "openai": os.environ["OPENAI_API_KEY"] = api_key os.environ["OPENROUTER_API_BASE"] = "https://api.openai.com/v1" else: os.environ["OPENAI_API_KEY"] = api_key os.environ["OPENROUTER_API_BASE"] = openrouter_base or "https://openrouter.ai/api/v1" # Check if we received a valid file if pdf_file is None: return "No file uploaded", None # Create a temporary file with .pdf extension with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp: # For Gradio uploads, we need to copy the file shutil.copy2(pdf_file.name, tmp.name) tmp_path = tmp.name print(f"Uploaded PDF saved at {tmp_path}") # Generate transcript using your existing function transcript, transcript_path = generate_podcast_script(tmp_path, provider=provider) if transcript is None: return "Error generating transcript", None # Define an output file path for the generated audio audio_output_path = os.path.join( os.path.dirname(tmp_path), f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}" ) result = generate_audio_from_script_with_voices( transcript, speaker1_voice, speaker2_voice, output_file=audio_output_path ) if result is None: return "Error generating audio", None return "Process complete!", result except Exception as e: print(f"Error in process_pdf: {str(e)}") return f"Error processing file: {str(e)}", None def create_gradio_app(): # Add CSS for better styling css = """ .gradio-container {max-width: 900px !important} """ with gr.Blocks(css=css, theme=gr.themes.Soft()) as app: gr.Markdown( """ # 📚 NotebookLM-Kokoro TTS App Upload a PDF, choose voices, and generate conversational audio using Kokoro TTS. """ ) with gr.Row(): with gr.Column(scale=2): pdf_input = gr.File( label="Upload PDF Document", file_types=[".pdf"], type="filepath" ) with gr.Row(): speaker1_voice = gr.Dropdown( choices=["af_heart", "af_bella", "hf_beta"], value="af_heart", label="Speaker 1 Voice" ) speaker2_voice = gr.Dropdown( choices=["af_nicole", "af_heart", "bf_emma"], value="af_nicole", label="Speaker 2 Voice" ) provider = gr.Radio( choices=["openai", "openrouter"], value="openrouter", label="API Provider" ) with gr.Group(): provider = gr.Radio( choices=["openai", "openrouter"], value="openrouter", label="API Provider" ) api_key = gr.Textbox( label="API Key", placeholder="Enter your API key here...", type="password", class_name="api-key-input" ) openrouter_base = gr.Textbox( label="OpenRouter Base URL (optional)", placeholder="https://openrouter.ai/api/v1", visible=False ) # Show/hide OpenRouter base URL based on provider selection def toggle_openrouter_base(provider_choice): return gr.update(visible=provider_choice == "openrouter") provider.change( fn=toggle_openrouter_base, inputs=[provider], outputs=[openrouter_base] ) submit_btn = gr.Button("🎙️ Generate Audio", variant="primary") with gr.Column(scale=2): status_output = gr.Textbox( label="Status", placeholder="Processing status will appear here..." ) audio_output = gr.Audio( label="Generated Audio", type="filepath" ) # Examples section gr.Examples( examples=[ ["sample.pdf", "af_heart", "af_nicole", "openrouter"], ], inputs=[pdf_input, speaker1_voice, speaker2_voice, provider], outputs=[status_output, audio_output], fn=process_pdf, cache_examples=True, ) submit_btn.click( fn=process_pdf, inputs=[pdf_input, speaker1_voice, speaker2_voice, provider], outputs=[status_output, audio_output], api_name="generate" ) gr.Markdown( """ ### 📝 Notes - Make sure your PDF is readable and contains text (not scanned images) - Processing large PDFs may take a few minutes - You need a valid OpenAI/OpenRouter API key set as environment variable """ ) return app if __name__ == "__main__": demo = create_gradio_app() demo.queue(concurrency_count=1).launch( server_name="0.0.0.0", server_port=7860, share=True, debug=True )