Nick088 committed on
Commit
caaeb6e
·
verified ·
1 Parent(s): b667bd4

Add TimeStamp Granularities

Browse files
Files changed (1) hide show
  1. app.py +286 -129
app.py CHANGED
@@ -243,11 +243,15 @@ def check_file(input_file_path):
243
 
244
  # subtitle maker
245
 
246
def format_time(seconds):
    """Convert a duration in seconds (float) to an SRT timestamp.

    Returns a string in the SRT format ``HH:MM:SS,mmm``.

    Bug fix: the original truncated ``seconds`` to an int *before*
    computing ``(seconds % 1) * 1000``, so the milliseconds field was
    always 000. Compute the fractional part from the original float
    before truncating.
    """
    total_seconds = int(seconds)
    # Milliseconds must come from the pre-truncation fractional part.
    milliseconds = int((seconds - total_seconds) * 1000)
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    secs = total_seconds % 60
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
253
 
@@ -265,173 +269,324 @@ def json_to_srt(transcription_json):
265
  return '\n'.join(srt_lines)
266
 
267
 
268
- def generate_subtitles(input_file, prompt, language, auto_detect_language, model, include_video, font_selection, font_file, font_color, font_size, outline_thickness, outline_color):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
  input_file_path = input_file
271
 
272
  processed_path, split_status = check_file(input_file_path)
273
- full_srt_content = ""
274
- total_duration = 0
275
- segment_id_offset = 0
 
 
 
 
 
 
 
 
276
 
 
277
  if split_status == "split":
278
- srt_chunks = []
279
- video_chunks = []
280
  for i, chunk_path in enumerate(processed_path):
 
 
 
281
  try:
 
282
  with open(chunk_path, "rb") as file:
283
  transcription_json_response = client.audio.transcriptions.create(
284
  file=(os.path.basename(chunk_path), file.read()),
285
  model=model,
286
  prompt=prompt,
287
  response_format="verbose_json",
 
288
  language=None if auto_detect_language else language,
289
  temperature=0.0,
290
  )
291
- transcription_json = transcription_json_response.segments
292
-
293
- # Adjust timestamps and segment IDs
294
- for segment in transcription_json:
295
- segment['start'] += total_duration
296
- segment['end'] += total_duration
297
- segment['id'] += segment_id_offset
298
- segment_id_offset += len(transcription_json)
299
- total_duration += transcription_json[-1]['end'] # Update total duration
300
-
301
- srt_content = json_to_srt(transcription_json)
302
- full_srt_content += srt_content
303
- temp_srt_path = f"{os.path.splitext(chunk_path)[0]}.srt"
304
- with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
305
- temp_srt_file.write(srt_content)
306
- temp_srt_file.write("\n") # add a new line at the end of the srt chunk file to fix format when merged
307
- srt_chunks.append(temp_srt_path)
308
 
309
- if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
310
- try:
311
- output_file_path = chunk_path.replace(os.path.splitext(chunk_path)[1], "_with_subs" + os.path.splitext(chunk_path)[1])
312
- # Handle font selection
313
- if font_selection == "Custom Font File" and font_file:
314
- font_name = os.path.splitext(os.path.basename(font_file.name))[0] # Get font filename without extension
315
- font_dir = os.path.dirname(font_file.name) # Get font directory path
316
- elif font_selection == "Custom Font File" and not font_file:
317
- font_name = None # Let FFmpeg use its default Arial
318
- font_dir = None # No font directory
319
- gr.Warning(f"You want to use a Custom Font File, but uploaded none. Using the default Arial font.")
320
- elif font_selection == "Arial":
321
- font_name = None # Let FFmpeg use its default Arial
322
- font_dir = None # No font directory
323
 
324
- # FFmpeg command
325
- subprocess.run(
326
- [
327
- "ffmpeg",
328
- "-y",
329
- "-i",
330
- chunk_path,
331
- "-vf",
332
- f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='Fontname={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
333
- "-preset", "fast",
334
- output_file_path,
335
- ],
336
- check=True,
337
- )
338
- video_chunks.append(output_file_path)
339
- except subprocess.CalledProcessError as e:
340
- raise gr.Error(f"Error during subtitle addition: {e}")
341
- elif include_video and not input_file_path.lower().endswith((".mp4", ".webm")):
342
- gr.Warning(f"You have checked on the 'Include Video with Subtitles', but the input file {input_file_path} isn't a video (.mp4 or .webm). Returning only the SRT File.", duration=15)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  except groq.AuthenticationError as e:
344
- handle_groq_error(e, model)
345
  except groq.RateLimitError as e:
346
- handle_groq_error(e, model)
347
- gr.Warning(f"API limit reached during chunk {i+1}. Returning processed chunks only.")
348
- if srt_chunks and video_chunks:
349
- merge_audio(video_chunks, 'merged_output_video.mp4')
350
- with open('merged_output.srt', 'w', encoding="utf-8") as outfile:
351
- for chunk_srt in srt_chunks:
352
- with open(chunk_srt, 'r', encoding="utf-8") as infile:
353
- outfile.write(infile.read())
354
- return 'merged_output.srt', 'merged_output_video.mp4'
355
- else:
356
- raise gr.Error("Subtitle generation failed due to API limits.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
- # Merge SRT chunks
359
- final_srt_path = os.path.splitext(input_file_path)[0] + "_final.srt"
360
- with open(final_srt_path, 'w', encoding="utf-8") as outfile:
361
- for chunk_srt in srt_chunks:
362
- with open(chunk_srt, 'r', encoding="utf-8") as infile:
363
- outfile.write(infile.read())
364
 
365
- # Merge video chunks
366
  if video_chunks:
367
- merge_audio(video_chunks, 'merged_output_video.mp4')
368
- return final_srt_path, 'merged_output_video.mp4'
369
- else:
370
- return final_srt_path, None
 
 
 
 
 
 
 
 
 
 
371
 
372
  else: # Single file processing (no splitting)
 
 
 
 
373
  try:
 
374
  with open(processed_path, "rb") as file:
375
  transcription_json_response = client.audio.transcriptions.create(
376
  file=(os.path.basename(processed_path), file.read()),
377
  model=model,
378
  prompt=prompt,
379
  response_format="verbose_json",
 
380
  language=None if auto_detect_language else language,
381
  temperature=0.0,
382
  )
383
- transcription_json = transcription_json_response.segments
384
 
385
- srt_content = json_to_srt(transcription_json)
386
- temp_srt_path = os.path.splitext(input_file_path)[0] + ".srt"
387
- with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
388
- temp_srt_file.write(srt_content)
389
 
390
- if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
391
- try:
392
- output_file_path = input_file_path.replace(
393
- os.path.splitext(input_file_path)[1], "_with_subs" + os.path.splitext(input_file_path)[1]
394
- )
395
- # Handle font selection
396
- if font_selection == "Custom Font File" and font_file:
397
- font_name = os.path.splitext(os.path.basename(font_file.name))[0] # Get font filename without extension
398
- font_dir = os.path.dirname(font_file.name) # Get font directory path
399
- elif font_selection == "Custom Font File" and not font_file:
400
- font_name = None # Let FFmpeg use its default Arial
401
- font_dir = None # No font directory
402
- gr.Warning(f"You want to use a Custom Font File, but uploaded none. Using the default Arial font.")
403
- elif font_selection == "Arial":
404
- font_name = None # Let FFmpeg use its default Arial
405
- font_dir = None # No font directory
406
-
407
- # FFmpeg command
408
- subprocess.run(
409
- [
410
- "ffmpeg",
411
- "-y",
412
- "-i",
413
- input_file_path,
414
- "-vf",
415
- f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='FontName={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
416
- "-preset", "fast",
417
- output_file_path,
418
- ],
419
- check=True,
420
- )
421
- return temp_srt_path, output_file_path
422
- except subprocess.CalledProcessError as e:
423
- raise gr.Error(f"Error during subtitle addition: {e}")
424
- elif include_video and not input_file_path.lower().endswith((".mp4", ".webm")):
425
- gr.Warning(f"You have checked on the 'Include Video with Subtitles', but the input file {input_file_path} isn't a video (.mp4 or .webm). Returning only the SRT File.", duration=15)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
- return temp_srt_path, None
 
 
 
 
 
 
 
428
  except groq.AuthenticationError as e:
429
  handle_groq_error(e, model)
430
  except groq.RateLimitError as e:
431
  handle_groq_error(e, model)
432
- except ValueError as e:
433
- raise gr.Error(f"Error creating SRT file: {e}")
434
-
 
 
 
 
 
 
 
 
435
 
436
  theme = gr.themes.Soft(
437
  primary_hue="sky",
@@ -483,6 +638,7 @@ with gr.Blocks(theme=theme, css=css) as interface:
483
  # Model and options
484
  model_choice_subtitles = gr.Dropdown(choices=["whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"], value="whisper-large-v3-turbo", label="Audio Speech Recogition (ASR) Model", info="'whisper-large-v3' = Multilingual high quality, 'whisper-large-v3-turbo' = Multilingual fast with minimal impact on quality, good balance, 'distil-whisper-large-v3-en' = English only, fastest with also slight impact on quality")
485
  transcribe_prompt_subtitles = gr.Textbox(label="Prompt (Optional)", info="Specify any context or spelling corrections.")
 
486
  with gr.Row():
487
  language_subtitles = gr.Dropdown(choices=[(lang, code) for lang, code in LANGUAGE_CODES.items()], value="en", label="Language")
488
  auto_detect_language_subtitles = gr.Checkbox(label="Auto Detect Language")
@@ -536,6 +692,7 @@ with gr.Blocks(theme=theme, css=css) as interface:
536
  inputs=[
537
  input_file,
538
  transcribe_prompt_subtitles,
 
539
  language_subtitles,
540
  auto_detect_language_subtitles,
541
  model_choice_subtitles,
 
243
 
244
  # subtitle maker
245
 
246
def format_time(seconds_float):
    """Render a float second count as an SRT timestamp ``HH:MM:SS,mmm``."""
    # Split into whole seconds and the millisecond remainder first, so the
    # fractional part is not lost to integer truncation.
    whole = int(seconds_float)
    ms = int((seconds_float - whole) * 1000)

    # divmod chains the unit conversions: seconds -> minutes -> hours.
    minutes, secs = divmod(whole, 60)
    hours, minutes = divmod(minutes, 60)

    return f"{hours:02}:{minutes:02}:{secs:02},{ms:03}"
257
 
 
269
  return '\n'.join(srt_lines)
270
 
271
 
272
def words_json_to_srt(words_data, starting_id=0):
    """Build word-level SRT entries from per-word timestamp data.

    Each entry in ``words_data`` is a mapping with ``start``, ``end``
    (seconds) and ``word`` keys. Entry numbering begins at
    ``starting_id + 1`` so chunked transcriptions can continue a running
    sequence.

    Consecutive cues are forced to be non-overlapping: a word's start is
    clamped to the previous word's (adjusted) end, and a cue whose end
    would not come after its start is given a 50 ms minimum duration so
    it is not flashed or dropped by players.
    """
    min_duration = 0.050  # 50 ms floor for zero/negative-length cues

    entries = []
    last_end = 0.0  # adjusted end time of the previous word

    for index, word_entry in enumerate(words_data):
        # Clamp start so it never precedes the previous cue's end.
        start = max(word_entry['start'], last_end)
        end = word_entry['end']

        # Guarantee a strictly positive duration after the clamp above.
        if end <= start:
            end = start + min_duration

        entries.append(
            f"{starting_id + index + 1}\n"
            f"{format_time(start)} --> {format_time(end)}\n"
            f"{word_entry['word']}\n"
        )

        # Subsequent words are measured against the *adjusted* end.
        last_end = end

    return '\n'.join(entries)
306
+
307
+ def generate_subtitles(input_file, prompt, timestamp_granularities_str, language, auto_detect_language, model, include_video, font_selection, font_file, font_color, font_size, outline_thickness, outline_color):
308
 
309
  input_file_path = input_file
310
 
311
  processed_path, split_status = check_file(input_file_path)
312
+ full_srt_content = "" # Used for accumulating SRT content string for split files
313
+ srt_chunks_paths = [] # Used to store paths of individual SRT chunk files for merging
314
+ video_chunks = [] # Used to store paths of video chunks with embedded subs
315
+ total_duration = 0 # Cumulative duration for timestamp adjustment in split files
316
+ srt_entry_offset = 0 # Cumulative SRT entry count (words or segments) for ID adjustment
317
+
318
+ # transforms the gradio dropdown choice str to a python list needed for the groq api
319
+ timestamp_granularities_list = [gran.strip() for gran in timestamp_granularities_str.split(',') if gran.strip()]
320
+
321
+ # Determine primary granularity for logic (prefer word if both specified, else segment)
322
+ primary_granularity = "word" if "word" in timestamp_granularities_list else "segment"
323
 
324
+ # handling splitted files or single ones
325
  if split_status == "split":
 
 
326
  for i, chunk_path in enumerate(processed_path):
327
+ chunk_srt_content = "" # SRT content for the current chunk
328
+ temp_srt_path = f"{os.path.splitext(chunk_path)[0]}.srt" # Path for this chunk's SRT file
329
+
330
  try:
331
+ gr.Info(f"Processing chunk {i+1}/{len(processed_path)}...")
332
  with open(chunk_path, "rb") as file:
333
  transcription_json_response = client.audio.transcriptions.create(
334
  file=(os.path.basename(chunk_path), file.read()),
335
  model=model,
336
  prompt=prompt,
337
  response_format="verbose_json",
338
+ timestamp_granularities=timestamp_granularities_list,
339
  language=None if auto_detect_language else language,
340
  temperature=0.0,
341
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
+ if primary_granularity == "word":
344
+ word_data = transcription_json_response.words
345
+ if word_data:
346
+ # Adjust timestamps BEFORE generating SRT
347
+ adjusted_word_data = []
348
+ for entry in word_data:
349
+ adjusted_entry = entry.copy()
350
+ adjusted_entry['start'] += total_duration
351
+ adjusted_entry['end'] += total_duration
352
+ adjusted_word_data.append(adjusted_entry)
 
 
 
 
353
 
354
+ # Generate SRT using adjusted data and current offset
355
+ chunk_srt_content = words_json_to_srt(adjusted_word_data, srt_entry_offset)
356
+
357
+ # Update offsets for the *next* chunk
358
+ total_duration = adjusted_word_data[-1]['end'] # Use adjusted end time
359
+ srt_entry_offset += len(word_data) # Increment by number of words in this chunk
360
+ else:
361
+ gr.Warning(f"API returned no word timestamps for chunk {i+1}.")
362
+
363
+ elif primary_granularity == "segment":
364
+ segment_data = transcription_json_response.segments
365
+ if segment_data:
366
+ # Adjust timestamps and IDs BEFORE generating SRT
367
+ adjusted_segment_data = []
368
+ max_original_id = -1
369
+ for entry in segment_data:
370
+ adjusted_entry = entry.copy()
371
+ adjusted_entry['start'] += total_duration
372
+ adjusted_entry['end'] += total_duration
373
+ max_original_id = max(max_original_id, adjusted_entry['id']) # Track max original ID for offset calc
374
+ adjusted_entry['id'] += srt_entry_offset # Adjust ID for SRT generation
375
+ adjusted_segment_data.append(adjusted_entry)
376
+
377
+ # Generate SRT using adjusted data
378
+ chunk_srt_content = json_to_srt(adjusted_segment_data) # json_to_srt uses the 'id' field directly
379
+
380
+ # Update offsets for the *next* chunk
381
+ total_duration = adjusted_segment_data[-1]['end'] # Use adjusted end time
382
+ srt_entry_offset += (max_original_id + 1) # Increment by number of segments in this chunk (based on original IDs)
383
+ else:
384
+ gr.Warning(f"API returned no segment timestamps for chunk {i+1}.")
385
+ else:
386
+ # This case should ideally not be reached due to dropdown default/logic
387
+ gr.Warning(f"Invalid timestamp granularity for chunk {i+1}. Skipping SRT generation for this chunk.")
388
+
389
+ # Write and store path for this chunk's SRT file if content exists
390
+ if chunk_srt_content:
391
+ with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
392
+ temp_srt_file.write(chunk_srt_content)
393
+ srt_chunks_paths.append(temp_srt_path)
394
+ full_srt_content += chunk_srt_content # Append to the full content string as well
395
+
396
+ # Video embedding for the chunk
397
+ if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
398
+ try:
399
+ output_video_chunk_path = chunk_path.replace(os.path.splitext(chunk_path)[1], "_with_subs" + os.path.splitext(chunk_path)[1])
400
+ # Handle font selection
401
+ font_name = None
402
+ font_dir = None
403
+ if font_selection == "Custom Font File" and font_file:
404
+ font_name = os.path.splitext(os.path.basename(font_file.name))[0]
405
+ font_dir = os.path.dirname(font_file.name)
406
+ elif font_selection == "Custom Font File" and not font_file:
407
+ gr.Warning(f"Custom Font File selected but none uploaded. Using default font for chunk {i+1}.")
408
+
409
+ # FFmpeg command for the chunk
410
+ subprocess.run(
411
+ [
412
+ "ffmpeg", "-y", "-i", chunk_path,
413
+ "-vf", f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='FontName={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
414
+ "-preset", "fast", output_video_chunk_path,
415
+ ], check=True,
416
+ )
417
+ video_chunks.append(output_video_chunk_path)
418
+ except subprocess.CalledProcessError as e:
419
+ # Warn but continue processing other chunks
420
+ gr.Warning(f"Error adding subtitles to video chunk {i+1}: {e}. Skipping video for this chunk.")
421
+ except Exception as e: # Catch other potential errors during font handling etc.
422
+ gr.Warning(f"Error preparing subtitle style for video chunk {i+1}: {e}. Skipping video for this chunk.")
423
+
424
+ elif include_video and i == 0: # Show warning only once for non-video input
425
+ gr.Warning(f"Include Video checked, but input isn't MP4/WebM. Only SRT will be generated.", duration=15)
426
+
427
+
428
  except groq.AuthenticationError as e:
429
+ handle_groq_error(e, model) # This will raise gr.Error and stop execution
430
  except groq.RateLimitError as e:
431
+ handle_groq_error(e, model) # This will raise gr.Error and stop execution
432
+ except Exception as e:
433
+ gr.Warning(f"Error processing chunk {i+1}: {e}. Skipping this chunk.")
434
+ # Remove potentially incomplete SRT for this chunk if it exists
435
+ if os.path.exists(temp_srt_path):
436
+ try: os.remove(temp_srt_path)
437
+ except: pass
438
+ continue # Move to the next chunk
439
+
440
+ # After processing all chunks
441
+ final_srt_path = None
442
+ final_video_path = None
443
+
444
+ # Merge SRT chunks if any were created
445
+ if srt_chunks_paths:
446
+ final_srt_path = os.path.splitext(input_file_path)[0] + "_final.srt"
447
+ gr.Info("Merging SRT chunks...")
448
+ with open(final_srt_path, 'w', encoding="utf-8") as outfile:
449
+ # Use the full_srt_content string which ensures correct order and content
450
+ outfile.write(full_srt_content)
451
+ # Clean up individual srt chunks paths
452
+ for srt_chunk_file in srt_chunks_paths:
453
+ try: os.remove(srt_chunk_file)
454
+ except: pass
455
+ # Clean up intermediate audio chunks used for transcription
456
+ for chunk in processed_path:
457
+ try: os.remove(chunk)
458
+ except: pass
459
+ else:
460
+ gr.Warning("No SRT content was generated from any chunk.")
461
 
 
 
 
 
 
 
462
 
463
+ # Merge video chunks if any were created
464
  if video_chunks:
465
+ # Check if number of video chunks matches expected number based on successful SRT generation
466
+ if len(video_chunks) != len(srt_chunks_paths):
467
+ gr.Warning("Mismatch between successful SRT chunks and video chunks created. Video merge might be incomplete.")
468
+
469
+ final_video_path = os.path.splitext(input_file_path)[0] + '_merged_video_with_subs.mp4' # More descriptive name
470
+ gr.Info("Merging video chunks...")
471
+ try:
472
+ merge_audio(video_chunks, final_video_path) # Re-using merge_audio logic for video files
473
+ # video_chunks are removed inside merge_audio if successful
474
+ except Exception as e:
475
+ gr.Error(f"Failed to merge video chunks: {e}")
476
+ final_video_path = None # Indicate failure
477
+
478
+ return final_srt_path, final_video_path
479
 
480
  else: # Single file processing (no splitting)
481
+ final_srt_path = None
482
+ final_video_path = None
483
+ temp_srt_path = os.path.splitext(processed_path)[0] + ".srt" # Use processed_path for naming
484
+
485
  try:
486
+ gr.Info("Processing file...")
487
  with open(processed_path, "rb") as file:
488
  transcription_json_response = client.audio.transcriptions.create(
489
  file=(os.path.basename(processed_path), file.read()),
490
  model=model,
491
  prompt=prompt,
492
  response_format="verbose_json",
493
+ timestamp_granularities=timestamp_granularities_list,
494
  language=None if auto_detect_language else language,
495
  temperature=0.0,
496
  )
 
497
 
498
+ srt_content = "" # Initialize
 
 
 
499
 
500
+ if primary_granularity == "word":
501
+ word_data = transcription_json_response.words
502
+ if word_data:
503
+ srt_content = words_json_to_srt(word_data, 0) # Start IDs from 0
504
+ else:
505
+ gr.Warning("API returned no word timestamps.")
506
+ elif primary_granularity == "segment":
507
+ segment_data = transcription_json_response.segments
508
+ if segment_data:
509
+ # No need to adjust IDs/timestamps for single file
510
+ srt_content = json_to_srt(segment_data)
511
+ else:
512
+ gr.Warning("API returned no segment timestamps.")
513
+ else:
514
+ # Should not happen
515
+ gr.Warning("Invalid timestamp granularity selected. Skipping SRT generation.")
516
+
517
+ # Write SRT file if content exists
518
+ if srt_content:
519
+ with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
520
+ temp_srt_file.write(srt_content)
521
+ final_srt_path = temp_srt_path # Set the final path
522
+
523
+ # Video embedding logic
524
+ if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
525
+ try:
526
+ output_video_path = processed_path.replace(
527
+ os.path.splitext(processed_path)[1], "_with_subs" + os.path.splitext(processed_path)[1]
528
+ )
529
+ # Handle font selection
530
+ font_name = None
531
+ font_dir = None
532
+ if font_selection == "Custom Font File" and font_file:
533
+ font_name = os.path.splitext(os.path.basename(font_file.name))[0]
534
+ font_dir = os.path.dirname(font_file.name)
535
+ elif font_selection == "Custom Font File" and not font_file:
536
+ gr.Warning(f"Custom Font File selected but none uploaded. Using default font.")
537
+
538
+ # FFmpeg command
539
+ gr.Info("Adding subtitles to video...")
540
+ subprocess.run(
541
+ [
542
+ "ffmpeg", "-y", "-i", processed_path, # Use processed_path as input
543
+ "-vf", f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='FontName={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
544
+ "-preset", "fast", output_video_path,
545
+ ], check=True,
546
+ )
547
+ final_video_path = output_video_path
548
+ except subprocess.CalledProcessError as e:
549
+ gr.Error(f"Error during subtitle addition: {e}")
550
+ # Keep SRT file, but no video output
551
+ final_video_path = None
552
+ except Exception as e:
553
+ gr.Error(f"Error preparing subtitle style for video: {e}")
554
+ final_video_path = None
555
+
556
+ elif include_video:
557
+ # Warning for non-video input shown once
558
+ gr.Warning(f"Include Video checked, but input isn't MP4/WebM. Only SRT will be generated.", duration=15)
559
+
560
+ # Clean up downsampled file if it was created and different from original input
561
+ if processed_path != input_file_path and os.path.exists(processed_path):
562
+ try: os.remove(processed_path)
563
+ except: pass
564
+
565
+ return final_srt_path, final_video_path # Return paths (video might be None)
566
 
567
+ else: # No SRT content generated
568
+ gr.Warning("No SRT content could be generated.")
569
+ # Clean up downsampled file if created
570
+ if processed_path != input_file_path and os.path.exists(processed_path):
571
+ try: os.remove(processed_path)
572
+ except: pass
573
+ return None, None # Return None for both outputs
574
+
575
  except groq.AuthenticationError as e:
576
  handle_groq_error(e, model)
577
  except groq.RateLimitError as e:
578
  handle_groq_error(e, model)
579
+ except Exception as e: # Catch any other error during single file processing
580
+ # Clean up downsampled file if created
581
+ if processed_path != input_file_path and os.path.exists(processed_path):
582
+ try: os.remove(processed_path)
583
+ except: pass
584
+ # Clean up potentially created empty SRT
585
+ if os.path.exists(temp_srt_path):
586
+ try: os.remove(temp_srt_path)
587
+ except: pass
588
+ raise gr.Error(f"An unexpected error occurred: {e}")
589
+
590
 
591
  theme = gr.themes.Soft(
592
  primary_hue="sky",
 
638
  # Model and options
639
  model_choice_subtitles = gr.Dropdown(choices=["whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"], value="whisper-large-v3-turbo", label="Audio Speech Recogition (ASR) Model", info="'whisper-large-v3' = Multilingual high quality, 'whisper-large-v3-turbo' = Multilingual fast with minimal impact on quality, good balance, 'distil-whisper-large-v3-en' = English only, fastest with also slight impact on quality")
640
  transcribe_prompt_subtitles = gr.Textbox(label="Prompt (Optional)", info="Specify any context or spelling corrections.")
641
+ timestamp_granularities_str = gr.Dropdown(choices=["word", "segment"], value="word", label="Timestamp Granularities", info="The level of detail of time measurement in the timestamps.")
642
  with gr.Row():
643
  language_subtitles = gr.Dropdown(choices=[(lang, code) for lang, code in LANGUAGE_CODES.items()], value="en", label="Language")
644
  auto_detect_language_subtitles = gr.Checkbox(label="Auto Detect Language")
 
692
  inputs=[
693
  input_file,
694
  transcribe_prompt_subtitles,
695
+ timestamp_granularities_str,
696
  language_subtitles,
697
  auto_detect_language_subtitles,
698
  model_choice_subtitles,