Update generate_slideshow.py

generate_slideshow.py (+67 −9)
@@ -22,8 +22,24 @@ from google import genai
 from google.genai import types
 from PIL import Image
 
+# Deepgram imports for TTS fallback
+try:
+    from deepgram import DeepgramClient
+    # Try different import paths based on SDK version
+    try:
+        from deepgram.clients.speak.v1.speak_client import SpeakOptions
+    except ImportError:
+        try:
+            from deepgram.clients.speak.v1 import SpeakOptions
+        except ImportError:
+            from deepgram.clients.speak import SpeakOptions
+    DEEPGRAM_AVAILABLE = True
+except ImportError:
+    print("Deepgram SDK not available. Install with 'pip install deepgram-sdk'")
+    DEEPGRAM_AVAILABLE = False
 
 GEMINI_API_KEY = os.environ.get("GEMINI_KEY")
+DEEPGRAM_KEY = os.environ.get("DEEPGRAM_KEY")
 
 # Dictionary to store temporary directories for cleanup
 _temp_dirs: Dict[str, str] = {}
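
The nested try/except chain above papers over deepgram-sdk moving SpeakOptions between releases. If you need to confirm which path your installed version actually exposes, a quick standalone check (a hypothetical snippet, not part of this commit) is:

import importlib

# Probe the same three candidate paths the commit falls through.
for path in (
    "deepgram.clients.speak.v1.speak_client",
    "deepgram.clients.speak.v1",
    "deepgram.clients.speak",
):
    try:
        module = importlib.import_module(path)
        if hasattr(module, "SpeakOptions"):
            print(f"SpeakOptions found in {path}")
            break
    except ImportError:
        continue
else:
    print("SpeakOptions not found in any known location")
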
@@ -214,9 +230,9 @@ Include:
 
 For each slide provide:
 1. Each title should be a single concise and coherent phrase accompanied by exactly one relevant emoji. (Do NOT use the colon ":" format for titles)
-2. 3-4 concise bullet points
+2. 3-4 concise bullet points; you will go into more detail in the speaker notes.
 3. Clear prose speaker notes suitable for narration that is accessible to general audiences
-4. A detailed and specific image prompt for an AI image generator that is relevant to the slide's content
+4. A detailed and specific image prompt for an AI image generator that is relevant to the slide's content. Do not include any text in the image.
 
 
 Respond with a JSON array where each element represents a slide in the following format:
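
The schema referenced by "the following format" sits outside this hunk. Purely for illustration, a slide element satisfying the four requirements might look like the dict below; the field names are assumptions, not the script's actual schema:

example_slide = {                                   # hypothetical field names
    "title": "Oceans in Motion 🌊",                 # one phrase, exactly one emoji, no colon
    "bullets": ["Currents", "Tides", "Waves"],      # 3-4 concise points
    "speaker_notes": "Plain-language narration suitable for general audiences.",
    "image_prompt": "Aerial view of swirling ocean currents, photorealistic, no text",
}
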
@@ -271,6 +287,7 @@ async def _generate_tts(narration: str, out_path: Path):
         pass
 
     # Try models in sequence until one works
+    gemini_exhausted = True
     for model in models_to_try:
         try:
             print(f"Attempting TTS with model: {model}")
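
gemini_exhausted starts True and is only cleared when a model succeeds, which lets the code after the loop decide whether to fall back. A minimal sketch of the pattern (hypothetical helper names, simplified error handling):

async def synth_with_fallback(models, synth) -> bool:
    """Return True once a model succeeds; False if every model hits quota."""
    for model in models:
        try:
            await synth(model)              # raises on failure
            return True                     # first working model wins
        except Exception as e:
            if getattr(e, "code", None) == 429:
                continue                    # quota exhausted: try the next model
            raise                           # non-quota errors propagate
    return False                            # caller can now try a non-Gemini fallback
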
@@ -306,12 +323,12 @@ async def _generate_tts(narration: str, out_path: Path):
                     )
                     with open(out_path, "ab") as f:
                         f.write(data)
-
-
-
-
-
-
+
+            await process_stream()
+            # If we get here, the model worked successfully
+            print(f"Successfully generated TTS using model: {model}")
+            gemini_exhausted = False
+            return
 
         except Exception as e:
             if hasattr(e, 'code') and getattr(e, 'code', None) == 429:
@@ -322,12 +339,53 @@ async def _generate_tts(narration: str, out_path: Path):
             print(f"Error with model {model}: {e}")
             raise
 
-    # If we've tried all models and none worked
+    # If we've tried all Gemini models and none worked, try Deepgram
+    if gemini_exhausted and DEEPGRAM_AVAILABLE and DEEPGRAM_KEY:
+        try:
+            print("Attempting TTS with Deepgram...")
+            # Run Deepgram in executor to avoid blocking
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, lambda: _generate_tts_with_deepgram(narration, out_path))
+            print("Successfully generated TTS using Deepgram")
+            return
+        except Exception as e:
+            print(f"Error with Deepgram TTS: {e}")
+            # Continue to fallback empty WAV if Deepgram fails
+
+    # Last resort fallback - create empty audio file
     print("All TTS models quota exhausted. Creating empty audio file.")
     with open(out_path, "wb") as f:
         f.write(b'RIFF$\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x00\x04\x00\x00\x00\x04\x00\x00\x01\x00\x08\x00data\x00\x00\x00\x00')
 
 
+def _generate_tts_with_deepgram(narration: str, out_path: Path):
+    """Generate TTS using Deepgram API"""
+    # Initialize the Deepgram client
+    deepgram = DeepgramClient(DEEPGRAM_KEY)
+    print(f"Using Deepgram for TTS generation")
+
+    # Configure speech options for v2.x API (which we confirmed works)
+    options = SpeakOptions(
+        model="aura-2-thalia-en",   # Use Thalia voice
+        encoding="linear16",        # This produces WAV format
+        container="wav",            # Specify WAV container
+        sample_rate=24000           # Sample rate in Hz
+    )
+
+    # Convert text to speech and save directly to file using the v2.x API
+    try:
+        response = deepgram.speak.rest.v("1").save(
+            str(out_path),          # Output filename
+            {"text": narration},    # Text to convert
+            options
+        )
+        print(f"Successfully generated TTS with Deepgram: {out_path}")
+        return response
+    except Exception as e:
+        print(f"Error generating TTS with Deepgram: {e}")
+        raise
+
+
 # ──────────────────────── Public Entry Point ────────────────────────
 async def generate_slideshow_with_audio_async(topic: str, **kwargs):
     """