Add TimeStamp Granularities

app.py CHANGED
@@ -243,11 +243,15 @@ def check_file(input_file_path):

 # subtitle maker

-def format_time(
+def format_time(seconds_float):
+    # Calculate total whole seconds and milliseconds
+    total_seconds = int(seconds_float)
+    milliseconds = int((seconds_float - total_seconds) * 1000)
+
+    # Calculate hours, minutes, and remaining seconds
+    hours = total_seconds // 3600
+    minutes = (total_seconds % 3600) // 60
+    seconds = total_seconds % 60

     return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
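For reference, the rewritten format_time turns a float number of seconds into an SRT timestamp (HH:MM:SS,mmm). A quick illustrative check, not part of the commit:

    format_time(3661.5)  # -> "01:01:01,500"
    format_time(0.5)     # -> "00:00:00,500"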
@@ -265,173 +269,324 @@ def json_to_srt(transcription_json):

     return '\n'.join(srt_lines)


+def words_json_to_srt(words_data, starting_id=0):
+    srt_lines = []
+    previous_end_time = 0.0  # Keep track of the end time of the previous word
+
+    for i, word_entry in enumerate(words_data):
+        # Get original start and end times
+        start_seconds = word_entry['start']
+        end_seconds = word_entry['end']
+
+        # --- Overlap Prevention Logic ---
+        # Ensure the start time is not before the previous word ended
+        start_seconds = max(start_seconds, previous_end_time)
+
+        # Ensure the end time is not before the start time (can happen with adjustments),
+        # and add a tiny minimum duration (e.g., 50ms) if start and end are identical,
+        # otherwise the subtitle might flash too quickly or be ignored by players.
+        min_duration = 0.050  # 50 milliseconds
+        if end_seconds <= start_seconds:
+            end_seconds = start_seconds + min_duration
+        # --- End of Overlap Prevention ---
+
+        # Format the potentially adjusted times
+        start_time_fmt = format_time(start_seconds)
+        end_time_fmt = format_time(end_seconds)
+        text = word_entry['word']
+        srt_id = starting_id + i + 1
+
+        srt_line = f"{srt_id}\n{start_time_fmt} --> {end_time_fmt}\n{text}\n"
+        srt_lines.append(srt_line)
+
+        # Update previous_end_time for the next iteration using the *adjusted* end time
+        previous_end_time = end_seconds
+
+    return '\n'.join(srt_lines)
+
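To see the overlap-prevention logic at work, consider a hypothetical pair of words where the second starts before the first has ended and has zero length (values invented for illustration):

    words = [
        {'word': 'Hello', 'start': 0.00, 'end': 0.42},
        {'word': 'world', 'start': 0.40, 'end': 0.40},
    ]
    print(words_json_to_srt(words))
    # The second entry is clamped to start at 0.42 and padded by min_duration:
    # 1
    # 00:00:00,000 --> 00:00:00,420
    # Hello
    #
    # 2
    # 00:00:00,420 --> 00:00:00,470
    # world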
-def generate_subtitles(input_file, prompt, language, auto_detect_language, model, include_video, font_selection, font_file, font_color, font_size, outline_thickness, outline_color):
+def generate_subtitles(input_file, prompt, timestamp_granularities_str, language, auto_detect_language, model, include_video, font_selection, font_file, font_color, font_size, outline_thickness, outline_color):

     input_file_path = input_file

     processed_path, split_status = check_file(input_file_path)
-    full_srt_content = ""
+    full_srt_content = ""  # Used for accumulating the SRT content string for split files
+    srt_chunks_paths = []  # Used to store paths of individual SRT chunk files for merging
+    video_chunks = []  # Used to store paths of video chunks with embedded subs
+    total_duration = 0  # Cumulative duration for timestamp adjustment in split files
+    srt_entry_offset = 0  # Cumulative SRT entry count (words or segments) for ID adjustment
+
+    # Transform the Gradio dropdown choice string into the Python list the Groq API needs
+    timestamp_granularities_list = [gran.strip() for gran in timestamp_granularities_str.split(',') if gran.strip()]
+
+    # Determine the primary granularity for the logic below (prefer word if both are specified, else segment)
+    primary_granularity = "word" if "word" in timestamp_granularities_list else "segment"

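The dropdown delivers a plain string, so the parsing above turns "word" into ["word"], and a comma-separated value such as "word, segment" (hypothetical, since the dropdown currently offers single choices) into ["word", "segment"], in which case primary_granularity resolves to "word":

    [gran.strip() for gran in "word, segment".split(',') if gran.strip()]
    # -> ['word', 'segment']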
+    # Handle split files or single ones
     if split_status == "split":
-        srt_chunks = []
-        video_chunks = []
         for i, chunk_path in enumerate(processed_path):
+            chunk_srt_content = ""  # SRT content for the current chunk
+            temp_srt_path = f"{os.path.splitext(chunk_path)[0]}.srt"  # Path for this chunk's SRT file
+
             try:
+                gr.Info(f"Processing chunk {i+1}/{len(processed_path)}...")
                 with open(chunk_path, "rb") as file:
                     transcription_json_response = client.audio.transcriptions.create(
                         file=(os.path.basename(chunk_path), file.read()),
                         model=model,
                         prompt=prompt,
                         response_format="verbose_json",
+                        timestamp_granularities=timestamp_granularities_list,
                         language=None if auto_detect_language else language,
                         temperature=0.0,
                     )

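The branches below rely on the verbose_json response exposing .words when word granularity is requested and .segments for segment granularity. A sketch of the assumed shape (illustrative values):

    # transcription_json_response.words    -> [{'word': 'Hello', 'start': 0.0, 'end': 0.42}, ...]
    # transcription_json_response.segments -> [{'id': 0, 'start': 0.0, 'end': 3.2, 'text': '...'}, ...]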
-                transcription_json = transcription_json_response.segments
-
-                # Adjust timestamps and segment IDs
-                for segment in transcription_json:
-                    segment['start'] += total_duration
-                    segment['end'] += total_duration
-                    segment['id'] += segment_id_offset
-                segment_id_offset += len(transcription_json)
-                total_duration += transcription_json[-1]['end']  # Update total duration
-
-                srt_content = json_to_srt(transcription_json)
-                full_srt_content += srt_content
-                temp_srt_path = f"{os.path.splitext(chunk_path)[0]}.srt"
-                with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
-                    temp_srt_file.write(srt_content)
-                    temp_srt_file.write("\n")  # add a new line at the end of the srt chunk file to fix format when merged
-                srt_chunks.append(temp_srt_path)
+                if primary_granularity == "word":
+                    word_data = transcription_json_response.words
+                    if word_data:
+                        # Adjust timestamps BEFORE generating SRT
+                        adjusted_word_data = []
+                        for entry in word_data:
+                            adjusted_entry = entry.copy()
+                            adjusted_entry['start'] += total_duration
+                            adjusted_entry['end'] += total_duration
+                            adjusted_word_data.append(adjusted_entry)

+                        # Generate SRT using the adjusted data and current offset
+                        chunk_srt_content = words_json_to_srt(adjusted_word_data, srt_entry_offset)
+
+                        # Update offsets for the *next* chunk
+                        total_duration = adjusted_word_data[-1]['end']  # Use the adjusted end time
+                        srt_entry_offset += len(word_data)  # Increment by the number of words in this chunk
+                    else:
+                        gr.Warning(f"API returned no word timestamps for chunk {i+1}.")
+
+                elif primary_granularity == "segment":
+                    segment_data = transcription_json_response.segments
+                    if segment_data:
+                        # Adjust timestamps and IDs BEFORE generating SRT
+                        adjusted_segment_data = []
+                        max_original_id = -1
+                        for entry in segment_data:
+                            adjusted_entry = entry.copy()
+                            adjusted_entry['start'] += total_duration
+                            adjusted_entry['end'] += total_duration
+                            max_original_id = max(max_original_id, adjusted_entry['id'])  # Track the max original ID for the offset calculation
+                            adjusted_entry['id'] += srt_entry_offset  # Adjust the ID for SRT generation
+                            adjusted_segment_data.append(adjusted_entry)
+
+                        # Generate SRT using the adjusted data
+                        chunk_srt_content = json_to_srt(adjusted_segment_data)  # json_to_srt uses the 'id' field directly
+
+                        # Update offsets for the *next* chunk
+                        total_duration = adjusted_segment_data[-1]['end']  # Use the adjusted end time
+                        srt_entry_offset += (max_original_id + 1)  # Increment by the number of segments in this chunk (based on original IDs)
+                    else:
+                        gr.Warning(f"API returned no segment timestamps for chunk {i+1}.")
+                else:
+                    # This case should ideally not be reached due to the dropdown default/logic
+                    gr.Warning(f"Invalid timestamp granularity for chunk {i+1}. Skipping SRT generation for this chunk.")
+
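A worked example of the offset bookkeeping: if chunk 1 ends at 600.0 s and produced 150 words, then for chunk 2 every word is shifted by total_duration = 600.0 and its SRT ID by srt_entry_offset = 150, so a word at 1.2 s into chunk 2 becomes entry 151 at 00:10:01,200. In segment mode the offset instead grows by max_original_id + 1, because segment IDs restart at 0 in every chunk.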
+                # Write and store the path for this chunk's SRT file if content exists
+                if chunk_srt_content:
+                    with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
+                        temp_srt_file.write(chunk_srt_content)
+                    srt_chunks_paths.append(temp_srt_path)
+                    full_srt_content += chunk_srt_content  # Append to the full content string as well
+
-                    gr.Warning(f"You want to use a Custom Font File, but uploaded none. Using the default Arial font.")
-                elif font_selection == "Arial":
-                    font_name = None  # Let FFmpeg use its default Arial
-                    font_dir = None  # No font directory
+                # Video embedding for the chunk
+                if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
+                    try:
+                        output_video_chunk_path = chunk_path.replace(os.path.splitext(chunk_path)[1], "_with_subs" + os.path.splitext(chunk_path)[1])
+                        # Handle font selection
+                        font_name = None
+                        font_dir = None
+                        if font_selection == "Custom Font File" and font_file:
+                            font_name = os.path.splitext(os.path.basename(font_file.name))[0]
+                            font_dir = os.path.dirname(font_file.name)
+                        elif font_selection == "Custom Font File" and not font_file:
+                            gr.Warning(f"Custom Font File selected but none uploaded. Using default font for chunk {i+1}.")
+
+                        # FFmpeg command for the chunk
+                        subprocess.run(
+                            [
+                                "ffmpeg", "-y", "-i", chunk_path,
+                                "-vf", f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='FontName={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
+                                "-preset", "fast", output_video_chunk_path,
+                            ], check=True,
+                        )
+                        video_chunks.append(output_video_chunk_path)
+                    except subprocess.CalledProcessError as e:
+                        # Warn but continue processing other chunks
+                        gr.Warning(f"Error adding subtitles to video chunk {i+1}: {e}. Skipping video for this chunk.")
+                    except Exception as e:  # Catch other potential errors during font handling etc.
+                        gr.Warning(f"Error preparing subtitle style for video chunk {i+1}: {e}. Skipping video for this chunk.")
+
+                elif include_video and i == 0:  # Show the warning only once for non-video input
+                    gr.Warning(f"Include Video checked, but input isn't MP4/WebM. Only SRT will be generated.", duration=15)
+
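One caveat with the force_style string: ASS colour values are &HBBGGRR& (blue-green-red), while the colour picker supplies #RRGGBB, so &H{font_color[1:]}& feeds the channels in RGB order and swaps red and blue for non-grey colours. A possible fix, sketched here but not part of this commit:

    def hex_to_ass_color(hex_color):
        # "#RRGGBB" -> "&HBBGGRR&"; ASS stores colours in BGR order
        r, g, b = hex_color[1:3], hex_color[3:5], hex_color[5:7]
        return f"&H{b}{g}{r}&"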
             except groq.AuthenticationError as e:
-                handle_groq_error(e, model)
+                handle_groq_error(e, model)  # This will raise gr.Error and stop execution
             except groq.RateLimitError as e:
-                handle_groq_error(e, model)
+                handle_groq_error(e, model)  # This will raise gr.Error and stop execution
+            except Exception as e:
+                gr.Warning(f"Error processing chunk {i+1}: {e}. Skipping this chunk.")
+                # Remove the potentially incomplete SRT for this chunk if it exists
+                if os.path.exists(temp_srt_path):
+                    try: os.remove(temp_srt_path)
+                    except: pass
+                continue  # Move on to the next chunk
+
+        # After processing all chunks
+        final_srt_path = None
+        final_video_path = None
+
-        # Merge SRT chunks
-        final_srt_path = os.path.splitext(input_file_path)[0] + "_final.srt"
-        with open(final_srt_path, 'w', encoding="utf-8") as outfile:
-            for chunk_srt in srt_chunks:
-                with open(chunk_srt, 'r', encoding="utf-8") as infile:
-                    outfile.write(infile.read())
+        # Merge SRT chunks if any were created
+        if srt_chunks_paths:
+            final_srt_path = os.path.splitext(input_file_path)[0] + "_final.srt"
+            gr.Info("Merging SRT chunks...")
+            with open(final_srt_path, 'w', encoding="utf-8") as outfile:
+                # Use the full_srt_content string, which ensures correct order and content
+                outfile.write(full_srt_content)
+            # Clean up the individual SRT chunk files
+            for srt_chunk_file in srt_chunks_paths:
+                try: os.remove(srt_chunk_file)
+                except: pass
+            # Clean up the intermediate audio chunks used for transcription
+            for chunk in processed_path:
+                try: os.remove(chunk)
+                except: pass
+        else:
+            gr.Warning("No SRT content was generated from any chunk.")

-        # Merge video chunks
+        # Merge video chunks if any were created
         if video_chunks:
+            # Check that the number of video chunks matches the number of successful SRT chunks
+            if len(video_chunks) != len(srt_chunks_paths):
+                gr.Warning("Mismatch between successful SRT chunks and video chunks created. Video merge might be incomplete.")
+
+            final_video_path = os.path.splitext(input_file_path)[0] + '_merged_video_with_subs.mp4'  # More descriptive name
+            gr.Info("Merging video chunks...")
+            try:
+                merge_audio(video_chunks, final_video_path)  # Re-using the merge_audio logic for video files
+                # video_chunks are removed inside merge_audio if successful
+            except Exception as e:
+                gr.Error(f"Failed to merge video chunks: {e}")
+                final_video_path = None  # Indicate failure
+
+        return final_srt_path, final_video_path

     else:  # Single file processing (no splitting)
+        final_srt_path = None
+        final_video_path = None
+        temp_srt_path = os.path.splitext(processed_path)[0] + ".srt"  # Use processed_path for naming
+
         try:
+            gr.Info("Processing file...")
             with open(processed_path, "rb") as file:
                 transcription_json_response = client.audio.transcriptions.create(
                     file=(os.path.basename(processed_path), file.read()),
                     model=model,
                     prompt=prompt,
                     response_format="verbose_json",
+                    timestamp_granularities=timestamp_granularities_list,
                     language=None if auto_detect_language else language,
                     temperature=0.0,
                 )
-            transcription_json = transcription_json_response.segments
-
-            temp_srt_path = os.path.splitext(input_file_path)[0] + ".srt"
-            with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
-                temp_srt_file.write(srt_content)

+            srt_content = ""  # Initialize

+            if primary_granularity == "word":
+                word_data = transcription_json_response.words
+                if word_data:
+                    srt_content = words_json_to_srt(word_data, 0)  # Start IDs from 0
+                else:
+                    gr.Warning("API returned no word timestamps.")
+            elif primary_granularity == "segment":
+                segment_data = transcription_json_response.segments
+                if segment_data:
+                    # No need to adjust IDs/timestamps for a single file
+                    srt_content = json_to_srt(segment_data)
+                else:
+                    gr.Warning("API returned no segment timestamps.")
+            else:
+                # Should not happen
+                gr.Warning("Invalid timestamp granularity selected. Skipping SRT generation.")
+
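With word granularity a single file therefore yields one SRT entry per word, e.g. (illustrative output):

    1
    00:00:00,000 --> 00:00:00,420
    Hello

    2
    00:00:00,420 --> 00:00:00,900
    world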
+            # Write the SRT file if content exists
+            if srt_content:
+                with open(temp_srt_path, "w", encoding="utf-8") as temp_srt_file:
+                    temp_srt_file.write(srt_content)
+                final_srt_path = temp_srt_path  # Set the final path
+
+                # Video embedding logic
+                if include_video and input_file_path.lower().endswith((".mp4", ".webm")):
+                    try:
+                        output_video_path = processed_path.replace(
+                            os.path.splitext(processed_path)[1], "_with_subs" + os.path.splitext(processed_path)[1]
+                        )
+                        # Handle font selection
+                        font_name = None
+                        font_dir = None
+                        if font_selection == "Custom Font File" and font_file:
+                            font_name = os.path.splitext(os.path.basename(font_file.name))[0]
+                            font_dir = os.path.dirname(font_file.name)
+                        elif font_selection == "Custom Font File" and not font_file:
+                            gr.Warning(f"Custom Font File selected but none uploaded. Using default font.")
+
+                        # FFmpeg command
+                        gr.Info("Adding subtitles to video...")
+                        subprocess.run(
+                            [
+                                "ffmpeg", "-y", "-i", processed_path,  # Use processed_path as the input
+                                "-vf", f"subtitles={temp_srt_path}:fontsdir={font_dir}:force_style='FontName={font_name},Fontsize={int(font_size)},PrimaryColour=&H{font_color[1:]}&,OutlineColour=&H{outline_color[1:]}&,BorderStyle={int(outline_thickness)},Outline=1'",
+                                "-preset", "fast", output_video_path,
+                            ], check=True,
+                        )
+                        final_video_path = output_video_path
+                    except subprocess.CalledProcessError as e:
+                        gr.Error(f"Error during subtitle addition: {e}")
+                        # Keep the SRT file, but produce no video output
+                        final_video_path = None
+                    except Exception as e:
+                        gr.Error(f"Error preparing subtitle style for video: {e}")
+                        final_video_path = None
+
+                elif include_video:
+                    # Warning for non-video input, shown once
+                    gr.Warning(f"Include Video checked, but input isn't MP4/WebM. Only SRT will be generated.", duration=15)
+
+                # Clean up the downsampled file if it was created and differs from the original input
+                if processed_path != input_file_path and os.path.exists(processed_path):
+                    try: os.remove(processed_path)
+                    except: pass
+
+                return final_srt_path, final_video_path  # Return the paths (video might be None)
+
+            else:  # No SRT content generated
+                gr.Warning("No SRT content could be generated.")
+                # Clean up the downsampled file if created
+                if processed_path != input_file_path and os.path.exists(processed_path):
+                    try: os.remove(processed_path)
+                    except: pass
+                return None, None  # Return None for both outputs
+
         except groq.AuthenticationError as e:
             handle_groq_error(e, model)
         except groq.RateLimitError as e:
             handle_groq_error(e, model)
+        except Exception as e:  # Catch any other error during single file processing
+            # Clean up the downsampled file if created
+            if processed_path != input_file_path and os.path.exists(processed_path):
+                try: os.remove(processed_path)
+                except: pass
+            # Clean up a potentially created empty SRT
+            if os.path.exists(temp_srt_path):
+                try: os.remove(temp_srt_path)
+                except: pass
+            raise gr.Error(f"An unexpected error occurred: {e}")
+

 theme = gr.themes.Soft(
     primary_hue="sky",
@@ -483,6 +638,7 @@ with gr.Blocks(theme=theme, css=css) as interface:

         # Model and options
         model_choice_subtitles = gr.Dropdown(choices=["whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"], value="whisper-large-v3-turbo", label="Automatic Speech Recognition (ASR) Model", info="'whisper-large-v3' = multilingual, highest quality; 'whisper-large-v3-turbo' = multilingual, fast with minimal impact on quality, a good balance; 'distil-whisper-large-v3-en' = English only, fastest, with a slight impact on quality")
         transcribe_prompt_subtitles = gr.Textbox(label="Prompt (Optional)", info="Specify any context or spelling corrections.")
+        timestamp_granularities_str = gr.Dropdown(choices=["word", "segment"], value="word", label="Timestamp Granularities", info="The level of detail of time measurement in the timestamps.")
         with gr.Row():
             language_subtitles = gr.Dropdown(choices=[(lang, code) for lang, code in LANGUAGE_CODES.items()], value="en", label="Language")
             auto_detect_language_subtitles = gr.Checkbox(label="Auto Detect Language")
@@ -536,6 +692,7 @@ with gr.Blocks(theme=theme, css=css) as interface:

         inputs=[
             input_file,
             transcribe_prompt_subtitles,
+            timestamp_granularities_str,
             language_subtitles,
             auto_detect_language_subtitles,
             model_choice_subtitles,
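Since Gradio passes click inputs positionally, timestamp_granularities_str must occupy the same slot here (third) as in the generate_subtitles signature. A direct call would look like this sketch (all argument values hypothetical):

    srt_path, video_path = generate_subtitles(
        "talk.mp4",                # input_file
        "",                        # prompt
        "word",                    # timestamp_granularities_str
        "en",                      # language
        False,                     # auto_detect_language
        "whisper-large-v3-turbo",  # model
        True,                      # include_video
        "Arial",                   # font_selection
        None,                      # font_file
        "#FFFFFF",                 # font_color
        24,                        # font_size
        1,                         # outline_thickness
        "#000000",                 # outline_color
    )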