cp557 committed on
Commit
7f8dde5
·
verified ·
1 Parent(s): 866d021

Update generate_slideshow.py

Browse files
Files changed (1) hide show
  1. generate_slideshow.py +23 -25
generate_slideshow.py CHANGED
@@ -1,9 +1,8 @@
1
  #!/usr/bin/env python3
2
  """
3
  Generates slide markdown plus TTS audio and images using Gemini models.
4
-
5
  Functions exposed:
6
- generate_slideshow_with_audio(topic) -> (list_of_slide_markdown, list_of_audio_paths, list_of_image_paths)
7
  """
8
 
9
  import asyncio
@@ -38,7 +37,7 @@ except ImportError:
38
  print("Deepgram SDK not available. Install with 'pip install deepgram-sdk'")
39
  DEEPGRAM_AVAILABLE = False
40
 
41
- GEMINI_API_KEY = os.environ.get("GEMINI_KEY")
42
  DEEPGRAM_KEY = os.environ.get("DEEPGRAM_KEY")
43
 
44
  # Dictionary to store temporary directories for cleanup
@@ -175,9 +174,9 @@ def _extract_markdown_slides(markdown: str) -> list[dict]:
175
 
176
 
177
  # ──────────────────────────── Gemini Calls ───────────────────────────
178
- async def _generate_image(prompt: str, output_path: Path) -> str:
179
  """Generate an image using Gemini Imagen model and save it to the specified path."""
180
- client = genai.Client(api_key=GEMINI_API_KEY)
181
 
182
  try:
183
  # Make this call in a separate thread to not block the event loop
@@ -210,9 +209,9 @@ async def _generate_image(prompt: str, output_path: Path) -> str:
210
  print(f"Error generating image: {e}")
211
  return ""
212
 
213
- def _generate_slideshow_markdown(topic: str) -> str:
214
  """Ask Gemini 2.5 Flash for a markdown deck following strict rules."""
215
- client = genai.Client(api_key=GEMINI_API_KEY)
216
  #model = "gemini-2.5-flash-preview-05-20"
217
  model = "gemini-2.5-pro-preview-06-05"
218
 
@@ -220,21 +219,17 @@ def _generate_slideshow_markdown(topic: str) -> str:
220
  <role>
221
  You are SlideGen, an AI that creates fun and engaging narrated slide decks with visual elements about various topics.
222
  </role>
223
-
224
  <instructions>
225
  Create a presentation about '{topic}'.
226
  Include:
227
  - An introduction slide with bullet points about the overview of the presentation topic and the key areas that will be covered
228
  - 3 content slides with bullet points
229
  - A conclusion slide with bullet points summarizing the key points and insights.
230
-
231
  For each slide provide:
232
  1. Each title should be a single concise and coherent phrase accompanied by exactly one relevant emoji. (Do NOT use the colon ":" format for titles)
233
  2. 3-4 concise bullet points, you will go into more detail in the speaker notes.
234
  3. Clear prose speaker notes suitable for narration that is accessible to general audiences
235
  4. A detailed and specific image prompt for an AI image generator that is relevent to the slide's content. Do not include any text in the image.
236
-
237
-
238
  Respond with a JSON array where each element represents a slide in the following format:
239
  ```json
240
  [
@@ -275,9 +270,9 @@ Respond with a JSON array where each element represents a slide in the following
275
  return response.text.strip()
276
 
277
 
278
- async def _generate_tts(narration: str, out_path: Path):
279
  """GenAI TTS → WAV - Async version with fallback model support"""
280
- client = genai.Client(api_key=GEMINI_API_KEY)
281
 
282
  # Try with flash model first, then fall back to pro model if needed
283
  models_to_try = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-06-05"]
@@ -387,17 +382,22 @@ def _generate_tts_with_deepgram(narration: str, out_path: Path):
387
 
388
 
389
  # ──────────────────────── Public Entry Point ───────────────────
390
- async def generate_slideshow_with_audio_async(topic: str, **kwargs):
391
  """
392
  Async version of generate_slideshow_with_audio that processes slides concurrently.
393
 
 
 
 
 
 
394
  Returns:
395
  slides_md : list[str] – markdown for each slide
396
  audio : list[str] – file paths (one per slide, same order)
397
  images : list[str|None] – file paths for slide images (one per slide, same order)
398
  """
399
  # Get JSON response from Gemini
400
- json_response = _generate_slideshow_markdown(topic)
401
 
402
  # Parse JSON into slides data
403
  slides_data = _parse_slides_json(json_response)
@@ -439,7 +439,7 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
439
  # Schedule TTS task
440
  if narration:
441
  print(f"Scheduling TTS for slide {i} -> {wav_path}")
442
- tts_tasks.append(_generate_tts(narration, wav_path))
443
  else:
444
  # Create empty placeholder WAV if no narration
445
  with open(wav_path, "wb") as f:
@@ -457,7 +457,7 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
457
  image_path = pres_dir / f"{safe_topic}_slide_{i:02d}_image.jpg"
458
  print(f"Scheduling image for slide {i} -> {image_path}")
459
  # Store task with index to track which slide it belongs to
460
- image_tasks.append((i-1, _generate_image(image_prompt, image_path)))
461
  else:
462
  print(f"No image prompt for slide {i}, skipping image generation.")
463
 
@@ -491,13 +491,14 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
491
  return slides_md, audio_files, slide_images
492
 
493
 
494
- def generate_slideshow_with_audio(topic: str, **kwargs):
495
  """
496
  Synchronous wrapper for the async slideshow generation function.
497
  Maintains backward compatibility with existing code.
498
 
499
  Args:
500
  topic: The topic to generate a slideshow about
 
501
  **kwargs: Optional parameters including:
502
  - session_id: Unique identifier for the user session
503
 
@@ -506,25 +507,22 @@ def generate_slideshow_with_audio(topic: str, **kwargs):
506
  audio : list[str] – file paths (one per slide, same order)
507
  images : list[str|None] – file paths for slide images (one per slide, same order)
508
  """
509
- return asyncio.run(generate_slideshow_with_audio_async(topic, **kwargs))
510
 
511
 
512
- def validate_topic(topic: str) -> bool:
513
  """Use Gemini Flash Preview to determine if a topic is suitable for a slideshow."""
514
- client = genai.Client(api_key=GEMINI_API_KEY)
515
  system_prompt = f'''
516
  <role>
517
  You are SlideGenInputGuard, an AI assistant that determines if a user input is a suitable topic for a narrated slideshow presentation.
518
  </role>
519
-
520
  <instructions>
521
  Evaluate if "{topic}" is a real-world topic, question, or concept suitable for an educational slideshow. It is fine to include topics that are silly and not real-world topics.
522
  If it is a valid topic, respond with exactly: 1
523
  If it is nonsense, gibberish, meaningless, empty, or not a valid topic, respond with exactly: 0
524
-
525
  Only respond with a single digit: 1 or 0. No spaces, newlines or explanations. JUST THE NUMBER 1 OR 0.
526
  </instructions>
527
-
528
  <examples>
529
  Input:How does lightning form?
530
  Output:1
@@ -559,4 +557,4 @@ Output:0
559
  config=types.GenerateContentConfig(response_mime_type="text/plain", temperature=0),
560
  )
561
  result = response.text.strip()
562
- return result == "1"
 
1
  #!/usr/bin/env python3
2
  """
3
  Generates slide markdown plus TTS audio and images using Gemini models.
 
4
  Functions exposed:
5
+ generate_slideshow_with_audio(topic, api_key) -> (list_of_slide_markdown, list_of_audio_paths, list_of_image_paths)
6
  """
7
 
8
  import asyncio
 
37
  print("Deepgram SDK not available. Install with 'pip install deepgram-sdk'")
38
  DEEPGRAM_AVAILABLE = False
39
 
40
+ # Remove the global API key - it will be passed as parameter
41
  DEEPGRAM_KEY = os.environ.get("DEEPGRAM_KEY")
42
 
43
  # Dictionary to store temporary directories for cleanup
 
174
 
175
 
176
  # ──────────────────────────── Gemini Calls ───────────────────────────
177
+ async def _generate_image(prompt: str, output_path: Path, api_key: str) -> str:
178
  """Generate an image using Gemini Imagen model and save it to the specified path."""
179
+ client = genai.Client(api_key=api_key)
180
 
181
  try:
182
  # Make this call in a separate thread to not block the event loop
 
209
  print(f"Error generating image: {e}")
210
  return ""
211
 
212
+ def _generate_slideshow_markdown(topic: str, api_key: str) -> str:
213
  """Ask Gemini 2.5 Flash for a markdown deck following strict rules."""
214
+ client = genai.Client(api_key=api_key)
215
  #model = "gemini-2.5-flash-preview-05-20"
216
  model = "gemini-2.5-pro-preview-06-05"
217
 
 
219
  <role>
220
  You are SlideGen, an AI that creates fun and engaging narrated slide decks with visual elements about various topics.
221
  </role>
 
222
  <instructions>
223
  Create a presentation about '{topic}'.
224
  Include:
225
  - An introduction slide with bullet points about the overview of the presentation topic and the key areas that will be covered
226
  - 3 content slides with bullet points
227
  - A conclusion slide with bullet points summarizing the key points and insights.
 
228
  For each slide provide:
229
  1. Each title should be a single concise and coherent phrase accompanied by exactly one relevant emoji. (Do NOT use the colon ":" format for titles)
230
  2. 3-4 concise bullet points, you will go into more detail in the speaker notes.
231
  3. Clear prose speaker notes suitable for narration that is accessible to general audiences
232
  4. A detailed and specific image prompt for an AI image generator that is relevent to the slide's content. Do not include any text in the image.
 
 
233
  Respond with a JSON array where each element represents a slide in the following format:
234
  ```json
235
  [
 
270
  return response.text.strip()
271
 
272
 
273
+ async def _generate_tts(narration: str, out_path: Path, api_key: str):
274
  """GenAI TTS → WAV - Async version with fallback model support"""
275
+ client = genai.Client(api_key=api_key)
276
 
277
  # Try with flash model first, then fall back to pro model if needed
278
  models_to_try = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-06-05"]
 
382
 
383
 
384
  # ──────────────────────── Public Entry Point ───────────────────
385
+ async def generate_slideshow_with_audio_async(topic: str, api_key: str, **kwargs):
386
  """
387
  Async version of generate_slideshow_with_audio that processes slides concurrently.
388
 
389
+ Args:
390
+ topic: The topic to generate a slideshow about
391
+ api_key: Gemini API key
392
+ **kwargs: Optional parameters including session_id
393
+
394
  Returns:
395
  slides_md : list[str] – markdown for each slide
396
  audio : list[str] – file paths (one per slide, same order)
397
  images : list[str|None] – file paths for slide images (one per slide, same order)
398
  """
399
  # Get JSON response from Gemini
400
+ json_response = _generate_slideshow_markdown(topic, api_key)
401
 
402
  # Parse JSON into slides data
403
  slides_data = _parse_slides_json(json_response)
 
439
  # Schedule TTS task
440
  if narration:
441
  print(f"Scheduling TTS for slide {i} -> {wav_path}")
442
+ tts_tasks.append(_generate_tts(narration, wav_path, api_key))
443
  else:
444
  # Create empty placeholder WAV if no narration
445
  with open(wav_path, "wb") as f:
 
457
  image_path = pres_dir / f"{safe_topic}_slide_{i:02d}_image.jpg"
458
  print(f"Scheduling image for slide {i} -> {image_path}")
459
  # Store task with index to track which slide it belongs to
460
+ image_tasks.append((i-1, _generate_image(image_prompt, image_path, api_key)))
461
  else:
462
  print(f"No image prompt for slide {i}, skipping image generation.")
463
 
 
491
  return slides_md, audio_files, slide_images
492
 
493
 
494
+ def generate_slideshow_with_audio(topic: str, api_key: str, **kwargs):
495
  """
496
  Synchronous wrapper for the async slideshow generation function.
497
  Maintains backward compatibility with existing code.
498
 
499
  Args:
500
  topic: The topic to generate a slideshow about
501
+ api_key: Gemini API key
502
  **kwargs: Optional parameters including:
503
  - session_id: Unique identifier for the user session
504
 
 
507
  audio : list[str] – file paths (one per slide, same order)
508
  images : list[str|None] – file paths for slide images (one per slide, same order)
509
  """
510
+ return asyncio.run(generate_slideshow_with_audio_async(topic, api_key, **kwargs))
511
 
512
 
513
+ def validate_topic(topic: str, api_key: str) -> bool:
514
  """Use Gemini Flash Preview to determine if a topic is suitable for a slideshow."""
515
+ client = genai.Client(api_key=api_key)
516
  system_prompt = f'''
517
  <role>
518
  You are SlideGenInputGuard, an AI assistant that determines if a user input is a suitable topic for a narrated slideshow presentation.
519
  </role>
 
520
  <instructions>
521
  Evaluate if "{topic}" is a real-world topic, question, or concept suitable for an educational slideshow. It is fine to include topics that are silly and not real-world topics.
522
  If it is a valid topic, respond with exactly: 1
523
  If it is nonsense, gibberish, meaningless, empty, or not a valid topic, respond with exactly: 0
 
524
  Only respond with a single digit: 1 or 0. No spaces, newlines or explanations. JUST THE NUMBER 1 OR 0.
525
  </instructions>
 
526
  <examples>
527
  Input:How does lightning form?
528
  Output:1
 
557
  config=types.GenerateContentConfig(response_mime_type="text/plain", temperature=0),
558
  )
559
  result = response.text.strip()
560
+ return result == "1"