Update generate_slideshow.py
generate_slideshow.py  (CHANGED: +23, -25)
@@ -1,9 +1,8 @@
 #!/usr/bin/env python3
 """
 Generates slide markdown plus TTS audio and images using Gemini models.
-
 Functions exposed:
-    generate_slideshow_with_audio(topic) -> (list_of_slide_markdown, list_of_audio_paths, list_of_image_paths)
+    generate_slideshow_with_audio(topic, api_key) -> (list_of_slide_markdown, list_of_audio_paths, list_of_image_paths)
 """

 import asyncio
@@ -38,7 +37,7 @@ except ImportError:
     print("Deepgram SDK not available. Install with 'pip install deepgram-sdk'")
     DEEPGRAM_AVAILABLE = False

-
+# Remove the global API key - it will be passed as parameter
 DEEPGRAM_KEY = os.environ.get("DEEPGRAM_KEY")

 # Dictionary to store temporary directories for cleanup
@@ -175,9 +174,9 @@ def _extract_markdown_slides(markdown: str) -> list[dict]:


 # ──────────────────────────── Gemini Calls ───────────────────────────
-async def _generate_image(prompt: str, output_path: Path) -> str:
+async def _generate_image(prompt: str, output_path: Path, api_key: str) -> str:
     """Generate an image using Gemini Imagen model and save it to the specified path."""
-    client = genai.Client(api_key=
+    client = genai.Client(api_key=api_key)

     try:
         # Make this call in a separate thread to not block the event loop
@@ -210,9 +209,9 @@ async def _generate_image(prompt: str, output_path: Path) -> str:
         print(f"Error generating image: {e}")
         return ""

-def _generate_slideshow_markdown(topic: str) -> str:
+def _generate_slideshow_markdown(topic: str, api_key: str) -> str:
     """Ask Gemini 2.5 Flash for a markdown deck following strict rules."""
-    client = genai.Client(api_key=
+    client = genai.Client(api_key=api_key)
     #model = "gemini-2.5-flash-preview-05-20"
     model = "gemini-2.5-pro-preview-06-05"

@@ -220,21 +219,17 @@ def _generate_slideshow_markdown(topic: str) -> str:
 <role>
 You are SlideGen, an AI that creates fun and engaging narrated slide decks with visual elements about various topics.
 </role>
-
 <instructions>
 Create a presentation about '{topic}'.
 Include:
 - An introduction slide with bullet points about the overview of the presentation topic and the key areas that will be covered
 - 3 content slides with bullet points
 - A conclusion slide with bullet points summarizing the key points and insights.
-
 For each slide provide:
 1. Each title should be a single concise and coherent phrase accompanied by exactly one relevant emoji. (Do NOT use the colon ":" format for titles)
 2. 3-4 concise bullet points, you will go into more detail in the speaker notes.
 3. Clear prose speaker notes suitable for narration that is accessible to general audiences
 4. A detailed and specific image prompt for an AI image generator that is relevent to the slide's content. Do not include any text in the image.
-
-
 Respond with a JSON array where each element represents a slide in the following format:
 ```json
 [
@@ -275,9 +270,9 @@ Respond with a JSON array where each element represents a slide in the following
     return response.text.strip()


-async def _generate_tts(narration: str, out_path: Path):
+async def _generate_tts(narration: str, out_path: Path, api_key: str):
     """GenAI TTS → WAV - Async version with fallback model support"""
-    client = genai.Client(api_key=
+    client = genai.Client(api_key=api_key)

     # Try with flash model first, then fall back to pro model if needed
     models_to_try = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-06-05"]
@@ -387,17 +382,22 @@ def _generate_tts_with_deepgram(narration: str, out_path: Path):


 # ──────────────────────── Public Entry Point ───────────────
-async def generate_slideshow_with_audio_async(topic: str, **kwargs):
+async def generate_slideshow_with_audio_async(topic: str, api_key: str, **kwargs):
     """
     Async version of generate_slideshow_with_audio that processes slides concurrently.

+    Args:
+        topic: The topic to generate a slideshow about
+        api_key: Gemini API key
+        **kwargs: Optional parameters including session_id
+
     Returns:
         slides_md : list[str] → markdown for each slide
         audio : list[str] → file paths (one per slide, same order)
         images : list[str|None] → file paths for slide images (one per slide, same order)
     """
     # Get JSON response from Gemini
-    json_response = _generate_slideshow_markdown(topic)
+    json_response = _generate_slideshow_markdown(topic, api_key)

     # Parse JSON into slides data
     slides_data = _parse_slides_json(json_response)
@@ -439,7 +439,7 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
         # Schedule TTS task
         if narration:
             print(f"Scheduling TTS for slide {i} -> {wav_path}")
-            tts_tasks.append(_generate_tts(narration, wav_path))
+            tts_tasks.append(_generate_tts(narration, wav_path, api_key))
         else:
             # Create empty placeholder WAV if no narration
             with open(wav_path, "wb") as f:
@@ -457,7 +457,7 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
             image_path = pres_dir / f"{safe_topic}_slide_{i:02d}_image.jpg"
             print(f"Scheduling image for slide {i} -> {image_path}")
             # Store task with index to track which slide it belongs to
-            image_tasks.append((i-1, _generate_image(image_prompt, image_path)))
+            image_tasks.append((i-1, _generate_image(image_prompt, image_path, api_key)))
         else:
             print(f"No image prompt for slide {i}, skipping image generation.")

@@ -491,13 +491,14 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
     return slides_md, audio_files, slide_images


-def generate_slideshow_with_audio(topic: str, **kwargs):
+def generate_slideshow_with_audio(topic: str, api_key: str, **kwargs):
     """
     Synchronous wrapper for the async slideshow generation function.
     Maintains backward compatibility with existing code.

     Args:
         topic: The topic to generate a slideshow about
+        api_key: Gemini API key
         **kwargs: Optional parameters including:
             - session_id: Unique identifier for the user session

@@ -506,25 +507,22 @@ def generate_slideshow_with_audio(topic: str, **kwargs):
         audio : list[str] → file paths (one per slide, same order)
         images : list[str|None] → file paths for slide images (one per slide, same order)
     """
-    return asyncio.run(generate_slideshow_with_audio_async(topic, **kwargs))
+    return asyncio.run(generate_slideshow_with_audio_async(topic, api_key, **kwargs))


-def validate_topic(topic: str) -> bool:
+def validate_topic(topic: str, api_key: str) -> bool:
     """Use Gemini Flash Preview to determine if a topic is suitable for a slideshow."""
-    client = genai.Client(api_key=
+    client = genai.Client(api_key=api_key)
     system_prompt = f'''
 <role>
 You are SlideGenInputGuard, an AI assistant that determines if a user input is a suitable topic for a narrated slideshow presentation.
 </role>
-
 <instructions>
 Evaluate if "{topic}" is a real-world topic, question, or concept suitable for an educational slideshow. It is fine to include topics that are silly and not real-world topics.
 If it is a valid topic, respond with exactly: 1
 If it is nonsense, gibberish, meaningless, empty, or not a valid topic, respond with exactly: 0
-
 Only respond with a single digit: 1 or 0. No spaces, newlines or explanations. JUST THE NUMBER 1 OR 0.
 </instructions>
-
 <examples>
 Input:How does lightning form?
 Output:1
@@ -559,4 +557,4 @@ Output:0
         config=types.GenerateContentConfig(response_mime_type="text/plain", temperature=0),
     )
     result = response.text.strip()
-    return result == "1"
+    return result == "1"
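
After this change every public entry point takes the Gemini key explicitly instead of reading a module-level global. A minimal usage sketch, assuming generate_slideshow.py is importable as a module and that the caller keeps the key in a GEMINI_API_KEY environment variable (both assumptions; the diff only requires that a key string be passed in):

import os

from generate_slideshow import generate_slideshow_with_audio, validate_topic

# Assumption: the caller stores the Gemini key in GEMINI_API_KEY; the updated
# functions only require that some key string be supplied explicitly.
api_key = os.environ["GEMINI_API_KEY"]

topic = "How does lightning form?"  # example input borrowed from the validator prompt

if validate_topic(topic, api_key):
    # Synchronous wrapper; it calls asyncio.run on the async implementation,
    # so use it only when no event loop is already running.
    slides_md, audio_paths, image_paths = generate_slideshow_with_audio(topic, api_key)
    # From async code, await generate_slideshow_with_audio_async(topic, api_key) instead.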