refinamento

Running

App Files Files Community

jhj0517 committed on Oct 31, 2024

Commit

1a63918

unverified ·

2 Parent(s): fb62be2 f197459

Merge pull request #366 from jhj0517/feature/enable-word-timestamps

Browse files

Files changed (10) hide show

app.py +1 -1
modules/diarize/diarize_pipeline.py +5 -3
modules/translation/deepl_api.py +16 -26
modules/translation/translation_base.py +18 -27
modules/utils/files_manager.py +6 -0
modules/utils/subtitle_manager.py +410 -114
modules/vad/silero_vad.py +1 -1
modules/whisper/base_transcription_pipeline.py +60 -90
modules/whisper/data_classes.py +52 -9
modules/whisper/faster_whisper_inference.py +1 -5

app.py CHANGED Viewed

@@ -53,7 +53,7 @@ class App:
             dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
                                   value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
                                   else whisper_params["lang"], label=_("Language"))
-            dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label=_("File Format"))
         with gr.Row():
             cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
                                        interactive=True)

             dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
                                   value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
                                   else whisper_params["lang"], label=_("Language"))
+            dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt", "LRC"], value="SRT", label=_("File Format"))
         with gr.Row():
             cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
                                        interactive=True)

modules/diarize/diarize_pipeline.py CHANGED Viewed

@@ -7,6 +7,7 @@ from pyannote.audio import Pipeline
 from typing import Optional, Union
 import torch
 from modules.utils.paths import DIARIZATION_MODELS_DIR
 from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
@@ -44,7 +45,8 @@ class DiarizationPipeline:
 def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
     transcript_segments = transcript_result["segments"]
     for seg in transcript_segments:
-        seg = seg.dict()
         # assign speaker to segment (if any)
         diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
                                                                                             seg['start'])
@@ -64,7 +66,7 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
             seg["speaker"] = speaker
         # assign speaker to words
-        if 'words' in seg:
             for word in seg['words']:
                 if 'start' in word:
                     diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
@@ -89,7 +91,7 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
     return transcript_result
-class Segment:
     def __init__(self, start, end, speaker=None):
         self.start = start
         self.end = end

 from typing import Optional, Union
 import torch
+from modules.whisper.data_classes import *
 from modules.utils.paths import DIARIZATION_MODELS_DIR
 from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
 def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
     transcript_segments = transcript_result["segments"]
     for seg in transcript_segments:
+        if isinstance(seg, Segment):
+            seg = seg.model_dump()
         # assign speaker to segment (if any)
         diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
                                                                                             seg['start'])
             seg["speaker"] = speaker
         # assign speaker to words
+        if 'words' in seg and seg['words'] is not None:
             for word in seg['words']:
                 if 'start' in word:
                     diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
     return transcript_result
+class DiarizationSegment:
     def __init__(self, start, end, speaker=None):
         self.start = start
         self.end = end

modules/translation/deepl_api.py CHANGED Viewed

@@ -139,37 +139,27 @@ class DeepLAPI:
         )
         files_info = {}
-        for fileobj in fileobjs:
-            file_path = fileobj
-            file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
-            if file_ext == ".srt":
-                parsed_dicts = parse_srt(file_path=file_path)
-            elif file_ext == ".vtt":
-                parsed_dicts = parse_vtt(file_path=file_path)
             batch_size = self.max_text_batch_size
-            for batch_start in range(0, len(parsed_dicts), batch_size):
-                batch_end = min(batch_start + batch_size, len(parsed_dicts))
-                sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
                 translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                 target_lang, is_pro)
                 for i, translated_text in enumerate(translated_texts):
-                    parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
-                progress(batch_end / len(parsed_dicts), desc="Translating..")
-            if file_ext == ".srt":
-                subtitle = get_serialized_srt(parsed_dicts)
-            elif file_ext == ".vtt":
-                subtitle = get_serialized_vtt(parsed_dicts)
-            if add_timestamp:
-                timestamp = datetime.now().strftime("%m%d%H%M%S")
-                file_name += f"-{timestamp}"
-            output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
-            write_file(subtitle, output_path)
             files_info[file_name] = {"subtitle": subtitle, "path": output_path}

         )
         files_info = {}
+        for file_path in fileobjs:
+            file_name, file_ext = os.path.splitext(os.path.basename(file_path))
+            writer = get_writer(file_ext, self.output_dir)
+            segments = writer.to_segments(file_path)
             batch_size = self.max_text_batch_size
+            for batch_start in range(0, len(segments), batch_size):
+                progress(batch_start / len(segments), desc="Translating..")
+                sentences_to_translate = [seg.text for seg in segments[batch_start:batch_start+batch_size]]
                 translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                 target_lang, is_pro)
                 for i, translated_text in enumerate(translated_texts):
+                    segments[batch_start + i].text = translated_text["text"]
+            subtitle, output_path = generate_file(
+                output_dir=self.output_dir,
+                output_file_name=file_name,
+                output_format=file_ext,
+                result=segments,
+                add_timestamp=add_timestamp
+            )
             files_info[file_name] = {"subtitle": subtitle, "path": output_path}

modules/translation/translation_base.py CHANGED Viewed

@@ -95,32 +95,22 @@ class TranslationBase(ABC):
             files_info = {}
             for fileobj in fileobjs:
                 file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
-                if file_ext == ".srt":
-                    parsed_dicts = parse_srt(file_path=fileobj)
-                    total_progress = len(parsed_dicts)
-                    for index, dic in enumerate(parsed_dicts):
-                        progress(index / total_progress, desc="Translating..")
-                        translated_text = self.translate(dic["sentence"], max_length=max_length)
-                        dic["sentence"] = translated_text
-                    subtitle = get_serialized_srt(parsed_dicts)
-                elif file_ext == ".vtt":
-                    parsed_dicts = parse_vtt(file_path=fileobj)
-                    total_progress = len(parsed_dicts)
-                    for index, dic in enumerate(parsed_dicts):
-                        progress(index / total_progress, desc="Translating..")
-                        translated_text = self.translate(dic["sentence"], max_length=max_length)
-                        dic["sentence"] = translated_text
-                    subtitle = get_serialized_vtt(parsed_dicts)
-                if add_timestamp:
-                    timestamp = datetime.now().strftime("%m%d%H%M%S")
-                    file_name += f"-{timestamp}"
-                output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
-                write_file(subtitle, output_path)
-                files_info[file_name] = {"subtitle": subtitle, "path": output_path}
             total_result = ''
             for file_name, info in files_info.items():
@@ -133,7 +123,8 @@ class TranslationBase(ABC):
             return [gr_str, output_file_paths]
         except Exception as e:
-            print(f"Error: {str(e)}")
         finally:
             self.release_cuda_memory()

             files_info = {}
             for fileobj in fileobjs:
                 file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
+                writer = get_writer(file_ext, self.output_dir)
+                segments = writer.to_segments(fileobj)
+                for i, segment in enumerate(segments):
+                    progress(i / len(segments), desc="Translating..")
+                    translated_text = self.translate(segment.text, max_length=max_length)
+                    segment.text = translated_text
+                subtitle, file_path = generate_file(
+                    output_dir=self.output_dir,
+                    output_file_name=file_name,
+                    output_format=file_ext,
+                    result=segments,
+                    add_timestamp=add_timestamp
+                )
+                files_info[file_name] = {"subtitle": subtitle, "path": file_path}
             total_result = ''
             for file_name, info in files_info.items():
             return [gr_str, output_file_paths]
         except Exception as e:
+            print(f"Error translating file: {e}")
+            raise
         finally:
             self.release_cuda_memory()

modules/utils/files_manager.py CHANGED Viewed

@@ -67,3 +67,9 @@ def is_video(file_path):
     video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
     extension = os.path.splitext(file_path)[1].lower()
     return extension in video_extensions

     video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
     extension = os.path.splitext(file_path)[1].lower()
     return extension in video_extensions
+def read_file(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        subtitle_content = f.read()
+    return subtitle_content

modules/utils/subtitle_manager.py CHANGED Viewed

@@ -1,128 +1,424 @@
-import re
-from modules.whisper.data_classes import Segment
-def timeformat_srt(time):
-    hours = time // 3600
-    minutes = (time - hours * 3600) // 60
-    seconds = time - hours * 3600 - minutes * 60
-    milliseconds = (time - int(time)) * 1000
-    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
-def timeformat_vtt(time):
-    hours = time // 3600
-    minutes = (time - hours * 3600) // 60
-    seconds = time - hours * 3600 - minutes * 60
-    milliseconds = (time - int(time)) * 1000
-    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
-def write_file(subtitle, output_file):
-    with open(output_file, 'w', encoding='utf-8') as f:
-        f.write(subtitle)
-def get_srt(segments):
-    if segments and isinstance(segments[0], Segment):
-        segments = [seg.dict() for seg in segments]
-    output = ""
-    for i, segment in enumerate(segments):
-        output += f"{i + 1}\n"
-        output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
-        if segment['text'].startswith(' '):
-            segment['text'] = segment['text'][1:]
-        output += f"{segment['text']}\n\n"
-    return output
-def get_vtt(segments):
-    if segments and isinstance(segments[0], Segment):
-        segments = [seg.dict() for seg in segments]
-    output = "WEBVTT\n\n"
-    for i, segment in enumerate(segments):
-        output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
-        if segment['text'].startswith(' '):
-            segment['text'] = segment['text'][1:]
-        output += f"{segment['text']}\n\n"
-    return output
-def get_txt(segments):
-    if segments and isinstance(segments[0], Segment):
-        segments = [seg.dict() for seg in segments]
-    output = ""
-    for i, segment in enumerate(segments):
-        if segment['text'].startswith(' '):
-            segment['text'] = segment['text'][1:]
-        output += f"{segment['text']}\n"
-    return output
-def parse_srt(file_path):
-    """Reads SRT file and returns as dict"""
-    with open(file_path, 'r', encoding='utf-8') as file:
-        srt_data = file.read()
-    data = []
-    blocks = srt_data.split('\n\n')
-    for block in blocks:
-        if block.strip() != '':
-            lines = block.strip().split('\n')
-            index = lines[0]
-            timestamp = lines[1]
-            sentence = ' '.join(lines[2:])
-            data.append({
-                "index": index,
-                "timestamp": timestamp,
-                "sentence": sentence
-            })
-    return data
-def parse_vtt(file_path):
-    """Reads WEBVTT file and returns as dict"""
-    with open(file_path, 'r', encoding='utf-8') as file:
-        webvtt_data = file.read()
-    data = []
-    blocks = webvtt_data.split('\n\n')
-    for block in blocks:
-        if block.strip() != '' and not block.strip().startswith("WEBVTT"):
-            lines = block.strip().split('\n')
-            timestamp = lines[0]
-            sentence = ' '.join(lines[1:])
-            data.append({
-                "timestamp": timestamp,
-                "sentence": sentence
-            })
-    return data
-def get_serialized_srt(dicts):
-    output = ""
-    for dic in dicts:
-        output += f'{dic["index"]}\n'
-        output += f'{dic["timestamp"]}\n'
-        output += f'{dic["sentence"]}\n\n'
-    return output
-def get_serialized_vtt(dicts):
-    output = "WEBVTT\n\n"
-    for dic in dicts:
-        output += f'{dic["timestamp"]}\n'
-        output += f'{dic["sentence"]}\n\n'
-    return output
 def safe_filename(name):

+# Ported from https://github.com/openai/whisper/blob/main/whisper/utils.py
+import json
+import os
+import re
+import sys
+import zlib
+from typing import Callable, List, Optional, TextIO, Union, Dict, Tuple
+from datetime import datetime
+from modules.whisper.data_classes import Segment, Word
+from .files_manager import read_file
+def format_timestamp(
+    seconds: float, always_include_hours: bool = True, decimal_marker: str = ","
+) -> str:
+    assert seconds >= 0, "non-negative timestamp expected"
+    milliseconds = round(seconds * 1000.0)
+    hours = milliseconds // 3_600_000
+    milliseconds -= hours * 3_600_000
+    minutes = milliseconds // 60_000
+    milliseconds -= minutes * 60_000
+    seconds = milliseconds // 1_000
+    milliseconds -= seconds * 1_000
+    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return (
+        f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+    )
+def time_str_to_seconds(time_str: str, decimal_marker: str = ",") -> float:
+    times = time_str.split(":")
+    if len(times) == 3:
+        hours, minutes, rest = times
+        hours = int(hours)
+    else:
+        hours = 0
+        minutes, rest = times
+    seconds, fractional = rest.split(decimal_marker)
+    minutes = int(minutes)
+    seconds = int(seconds)
+    fractional_seconds = float("0." + fractional)
+    return hours * 3600 + minutes * 60 + seconds + fractional_seconds
+def get_start(segments: List[dict]) -> Optional[float]:
+    return next(
+        (w["start"] for s in segments for w in s["words"]),
+        segments[0]["start"] if segments else None,
+    )
+def get_end(segments: List[dict]) -> Optional[float]:
+    return next(
+        (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
+        segments[-1]["end"] if segments else None,
+    )
+class ResultWriter:
+    extension: str
+    def __init__(self, output_dir: str):
+        self.output_dir = output_dir
+    def __call__(
+        self, result: Union[dict, List[Segment]], output_file_name: str,
+            options: Optional[dict] = None, **kwargs
+    ):
+        if isinstance(result, List) and result and isinstance(result[0], Segment):
+            result = {"segments": [seg.model_dump() for seg in result]}
+        output_path = os.path.join(
+            self.output_dir, output_file_name + "." + self.extension
+        )
+        with open(output_path, "w", encoding="utf-8") as f:
+            self.write_result(result, file=f, options=options, **kwargs)
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        raise NotImplementedError
+class WriteTXT(ResultWriter):
+    extension: str = "txt"
+    def write_result(
+        self, result: Union[Dict, List[Segment]], file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        for segment in result["segments"]:
+            print(segment["text"].strip(), file=file, flush=True)
+class SubtitlesWriter(ResultWriter):
+    always_include_hours: bool
+    decimal_marker: str
+    def iterate_result(
+        self,
+        result: dict,
+        options: Optional[dict] = None,
+        *,
+        max_line_width: Optional[int] = None,
+        max_line_count: Optional[int] = None,
+        highlight_words: bool = False,
+        align_lrc_words: bool = False,
+        max_words_per_line: Optional[int] = None,
+    ):
+        options = options or {}
+        max_line_width = max_line_width or options.get("max_line_width")
+        max_line_count = max_line_count or options.get("max_line_count")
+        highlight_words = highlight_words or options.get("highlight_words", False)
+        align_lrc_words = align_lrc_words or options.get("align_lrc_words", False)
+        max_words_per_line = max_words_per_line or options.get("max_words_per_line")
+        preserve_segments = max_line_count is None or max_line_width is None
+        max_line_width = max_line_width or 1000
+        max_words_per_line = max_words_per_line or 1000
+        def iterate_subtitles():
+            line_len = 0
+            line_count = 1
+            # the next subtitle to yield (a list of word timings with whitespace)
+            subtitle: List[dict] = []
+            last: float = get_start(result["segments"]) or 0.0
+            for segment in result["segments"]:
+                chunk_index = 0
+                words_count = max_words_per_line
+                while chunk_index < len(segment["words"]):
+                    remaining_words = len(segment["words"]) - chunk_index
+                    if max_words_per_line > len(segment["words"]) - chunk_index:
+                        words_count = remaining_words
+                    for i, original_timing in enumerate(
+                        segment["words"][chunk_index : chunk_index + words_count]
+                    ):
+                        timing = original_timing.copy()
+                        long_pause = (
+                            not preserve_segments and timing["start"] - last > 3.0
+                        )
+                        has_room = line_len + len(timing["word"]) <= max_line_width
+                        seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
+                        if (
+                            line_len > 0
+                            and has_room
+                            and not long_pause
+                            and not seg_break
+                        ):
+                            # line continuation
+                            line_len += len(timing["word"])
+                        else:
+                            # new line
+                            timing["word"] = timing["word"].strip()
+                            if (
+                                len(subtitle) > 0
+                                and max_line_count is not None
+                                and (long_pause or line_count >= max_line_count)
+                                or seg_break
+                            ):
+                                # subtitle break
+                                yield subtitle
+                                subtitle = []
+                                line_count = 1
+                            elif line_len > 0:
+                                # line break
+                                line_count += 1
+                                timing["word"] = "\n" + timing["word"]
+                            line_len = len(timing["word"].strip())
+                        subtitle.append(timing)
+                        last = timing["start"]
+                    chunk_index += max_words_per_line
+            if len(subtitle) > 0:
+                yield subtitle
+        if len(result["segments"]) > 0 and "words" in result["segments"][0] and result["segments"][0]["words"]:
+            for subtitle in iterate_subtitles():
+                subtitle_start = self.format_timestamp(subtitle[0]["start"])
+                subtitle_end = self.format_timestamp(subtitle[-1]["end"])
+                subtitle_text = "".join([word["word"] for word in subtitle])
+                if highlight_words:
+                    last = subtitle_start
+                    all_words = [timing["word"] for timing in subtitle]
+                    for i, this_word in enumerate(subtitle):
+                        start = self.format_timestamp(this_word["start"])
+                        end = self.format_timestamp(this_word["end"])
+                        if last != start:
+                            yield last, start, subtitle_text
+                        yield start, end, "".join(
+                            [
+                                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
+                                if j == i
+                                else word
+                                for j, word in enumerate(all_words)
+                            ]
+                        )
+                        last = end
+                if align_lrc_words:
+                    lrc_aligned_words = [f"[{self.format_timestamp(sub['start'])}]{sub['word']}" for sub in subtitle]
+                    l_start, l_end = self.format_timestamp(subtitle[-1]['start']), self.format_timestamp(subtitle[-1]['end'])
+                    lrc_aligned_words[-1] = f"[{l_start}]{subtitle[-1]['word']}[{l_end}]"
+                    lrc_aligned_words = ' '.join(lrc_aligned_words)
+                    yield None, None, lrc_aligned_words
+                else:
+                    yield subtitle_start, subtitle_end, subtitle_text
+        else:
+            for segment in result["segments"]:
+                segment_start = self.format_timestamp(segment["start"])
+                segment_end = self.format_timestamp(segment["end"])
+                segment_text = segment["text"].strip().replace("-->", "->")
+                yield segment_start, segment_end, segment_text
+    def format_timestamp(self, seconds: float):
+        return format_timestamp(
+            seconds=seconds,
+            always_include_hours=self.always_include_hours,
+            decimal_marker=self.decimal_marker,
+        )
+class WriteVTT(SubtitlesWriter):
+    extension: str = "vtt"
+    always_include_hours: bool = False
+    decimal_marker: str = "."
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        print("WEBVTT\n", file=file)
+        for start, end, text in self.iterate_result(result, options, **kwargs):
+            print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
+    def to_segments(self, file_path: str) -> List[Segment]:
+        segments = []
+        blocks = read_file(file_path).split('\n\n')
+        for block in blocks:
+            if block.strip() != '' and not block.strip().startswith("WEBVTT"):
+                lines = block.strip().split('\n')
+                time_line = lines[0].split(" --> ")
+                start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
+                sentence = ' '.join(lines[1:])
+                segments.append(Segment(
+                    start=start,
+                    end=end,
+                    text=sentence
+                ))
+        return segments
+class WriteSRT(SubtitlesWriter):
+    extension: str = "srt"
+    always_include_hours: bool = True
+    decimal_marker: str = ","
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        for i, (start, end, text) in enumerate(
+            self.iterate_result(result, options, **kwargs), start=1
+        ):
+            print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)
+    def to_segments(self, file_path: str) -> List[Segment]:
+        segments = []
+        blocks = read_file(file_path).split('\n\n')
+        for block in blocks:
+            if block.strip() != '':
+                lines = block.strip().split('\n')
+                index = lines[0]
+                time_line = lines[1].split(" --> ")
+                start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
+                sentence = ' '.join(lines[2:])
+                segments.append(Segment(
+                    start=start,
+                    end=end,
+                    text=sentence
+                ))
+        return segments
+class WriteLRC(SubtitlesWriter):
+    extension: str = "lrc"
+    always_include_hours: bool = False
+    decimal_marker: str = "."
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        for i, (start, end, text) in enumerate(
+            self.iterate_result(result, options, **kwargs), start=1
+        ):
+            if "align_lrc_words" in kwargs and kwargs["align_lrc_words"]:
+                print(f"{text}\n", file=file, flush=True)
+            else:
+                print(f"[{start}]{text}[{end}]\n", file=file, flush=True)
+    def to_segments(self, file_path: str) -> List[Segment]:
+        segments = []
+        blocks = read_file(file_path).split('\n')
+        for block in blocks:
+            if block.strip() != '':
+                lines = block.strip()
+                pattern = r'(\[.*?\])'
+                parts = re.split(pattern, lines)
+                parts = [part.strip() for part in parts if part]
+                for i, part in enumerate(parts):
+                    sentence_i = i%2
+                    if sentence_i == 1:
+                        start_str, text, end_str = parts[sentence_i-1], parts[sentence_i], parts[sentence_i+1]
+                        start_str, end_str = start_str.replace("[", "").replace("]", ""), end_str.replace("[", "").replace("]", "")
+                        start, end = time_str_to_seconds(start_str, self.decimal_marker), time_str_to_seconds(end_str, self.decimal_marker)
+                        segments.append(Segment(
+                            start=start,
+                            end=end,
+                            text=text,
+                        ))
+        return segments
+class WriteTSV(ResultWriter):
+    """
+    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
+    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
+    Using integer milliseconds as start and end times means there's no chance of interference from
+    an environment setting a language encoding that causes the decimal in a floating point number
+    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
+    """
+    extension: str = "tsv"
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        print("start", "end", "text", sep="\t", file=file)
+        for segment in result["segments"]:
+            print(round(1000 * segment["start"]), file=file, end="\t")
+            print(round(1000 * segment["end"]), file=file, end="\t")
+            print(segment["text"].strip().replace("\t", " "), file=file, flush=True)
+class WriteJSON(ResultWriter):
+    extension: str = "json"
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        json.dump(result, file)
+def get_writer(
+    output_format: str, output_dir: str
+) -> Callable[[dict, TextIO, dict], None]:
+    output_format = output_format.strip().lower().replace(".", "")
+    writers = {
+        "txt": WriteTXT,
+        "vtt": WriteVTT,
+        "srt": WriteSRT,
+        "tsv": WriteTSV,
+        "json": WriteJSON,
+        "lrc": WriteLRC
+    }
+    if output_format == "all":
+        all_writers = [writer(output_dir) for writer in writers.values()]
+        def write_all(
+            result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+        ):
+            for writer in all_writers:
+                writer(result, file, options, **kwargs)
+        return write_all
+    return writers[output_format](output_dir)
+def generate_file(
+    output_format: str, output_dir: str, result: Union[dict, List[Segment]], output_file_name: str,
+    add_timestamp: bool = True, **kwargs
+) -> Tuple[str, str]:
+    output_format = output_format.strip().lower().replace(".", "")
+    if add_timestamp:
+        timestamp = datetime.now().strftime("%m%d%H%M%S")
+        output_file_name += f"-{timestamp}"
+    file_path = os.path.join(output_dir, f"{output_file_name}.{output_format}")
+    file_writer = get_writer(output_format=output_format, output_dir=output_dir)
+    if isinstance(file_writer, WriteLRC) and kwargs.get("highlight_words", False):
+        kwargs["highlight_words"], kwargs["align_lrc_words"] = False, True
+    file_writer(result=result, output_file_name=output_file_name, **kwargs)
+    content = read_file(file_path)
+    return content, file_path
 def safe_filename(name):

modules/vad/silero_vad.py CHANGED Viewed

@@ -259,7 +259,7 @@ class SileroVAD:
         for segment in segments:
             segment.start = ts_map.get_original_time(segment.start)
-            segment.start = ts_map.get_original_time(segment.start)
         return segments

         for segment in segments:
             segment.start = ts_map.get_original_time(segment.start)
+            segment.end = ts_map.get_original_time(segment.end)
         return segments

modules/whisper/base_transcription_pipeline.py CHANGED Viewed

@@ -1,6 +1,4 @@
 import os
-import torch
-import ast
 import whisper
 import ctranslate2
 import gradio as gr
@@ -10,15 +8,14 @@ from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 from datetime import datetime
 from faster_whisper.vad import VadOptions
-from dataclasses import astuple
 from modules.uvr.music_separator import MusicSeparator
 from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
                                  UVR_MODELS_DIR)
 from modules.utils.constants import *
-from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
-from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
 from modules.whisper.data_classes import *
 from modules.diarize.diarizer import Diarizer
 from modules.vad.silero_vad import SileroVAD
@@ -76,7 +73,7 @@ class BaseTranscriptionPipeline(ABC):
             progress: gr.Progress = gr.Progress(),
             add_timestamp: bool = True,
             *pipeline_params,
-            ) -> Tuple[List[dict], float]:
         """
         Run transcription with conditional pre-processing and post-processing.
         The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
@@ -92,12 +89,14 @@ class BaseTranscriptionPipeline(ABC):
         add_timestamp: bool
             Whether to add a timestamp at the end of the filename.
         *pipeline_params: tuple
-            Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class
         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for running
         """
@@ -179,7 +178,7 @@ class BaseTranscriptionPipeline(ABC):
                         file_format: str = "SRT",
                         add_timestamp: bool = True,
                         progress=gr.Progress(),
-                        *params,
                         ) -> list:
         """
         Write subtitle file from Files
@@ -197,7 +196,7 @@ class BaseTranscriptionPipeline(ABC):
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
-        *params: tuple
             Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class
         Returns
@@ -208,6 +207,11 @@ class BaseTranscriptionPipeline(ABC):
             Output file path to return to gr.Files()
         """
         try:
             if input_folder_path:
                 files = get_media_files(input_folder_path)
             if isinstance(files, str):
@@ -221,18 +225,19 @@ class BaseTranscriptionPipeline(ABC):
                     file,
                     progress,
                     add_timestamp,
-                    *params,
                 )
                 file_name, file_ext = os.path.splitext(os.path.basename(file))
-                subtitle, file_path = self.generate_and_write_file(
-                    file_name=file_name,
-                    transcribed_segments=transcribed_segments,
                     add_timestamp=add_timestamp,
-                    file_format=file_format,
-                    output_dir=self.output_dir
                 )
-                files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
             total_result = ''
             total_time = 0
@@ -249,6 +254,7 @@ class BaseTranscriptionPipeline(ABC):
         except Exception as e:
             print(f"Error transcribing file: {e}")
         finally:
             self.release_cuda_memory()
@@ -257,7 +263,7 @@ class BaseTranscriptionPipeline(ABC):
                        file_format: str = "SRT",
                        add_timestamp: bool = True,
                        progress=gr.Progress(),
-                       *whisper_params,
                        ) -> list:
         """
         Write subtitle file from microphone
@@ -272,7 +278,7 @@ class BaseTranscriptionPipeline(ABC):
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
-        *whisper_params: tuple
             Parameters related with whisper. This will be dealt with "WhisperParameters" data class
         Returns
@@ -283,27 +289,35 @@ class BaseTranscriptionPipeline(ABC):
             Output file path to return to gr.Files()
         """
         try:
             progress(0, desc="Loading Audio..")
             transcribed_segments, time_for_task = self.run(
                 mic_audio,
                 progress,
                 add_timestamp,
-                *whisper_params,
             )
             progress(1, desc="Completed!")
-            subtitle, result_file_path = self.generate_and_write_file(
-                file_name="Mic",
-                transcribed_segments=transcribed_segments,
                 add_timestamp=add_timestamp,
-                file_format=file_format,
-                output_dir=self.output_dir
             )
             result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
-            return [result_str, result_file_path]
         except Exception as e:
-            print(f"Error transcribing file: {e}")
         finally:
             self.release_cuda_memory()
@@ -312,7 +326,7 @@ class BaseTranscriptionPipeline(ABC):
                            file_format: str = "SRT",
                            add_timestamp: bool = True,
                            progress=gr.Progress(),
-                           *whisper_params,
                            ) -> list:
         """
         Write subtitle file from Youtube
@@ -327,7 +341,7 @@ class BaseTranscriptionPipeline(ABC):
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
-        *whisper_params: tuple
             Parameters related with whisper. This will be dealt with "WhisperParameters" data class
         Returns
@@ -338,6 +352,11 @@ class BaseTranscriptionPipeline(ABC):
             Output file path to return to gr.Files()
         """
         try:
             progress(0, desc="Loading Audio from Youtube..")
             yt = get_ytdata(youtube_link)
             audio = get_ytaudio(yt)
@@ -346,28 +365,31 @@ class BaseTranscriptionPipeline(ABC):
                 audio,
                 progress,
                 add_timestamp,
-                *whisper_params,
             )
             progress(1, desc="Completed!")
             file_name = safe_filename(yt.title)
-            subtitle, result_file_path = self.generate_and_write_file(
-                file_name=file_name,
-                transcribed_segments=transcribed_segments,
                 add_timestamp=add_timestamp,
-                file_format=file_format,
-                output_dir=self.output_dir
             )
             result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
             if os.path.exists(audio):
                 os.remove(audio)
-            return [result_str, result_file_path]
         except Exception as e:
-            print(f"Error transcribing file: {e}")
         finally:
             self.release_cuda_memory()
@@ -385,58 +407,6 @@ class BaseTranscriptionPipeline(ABC):
         else:
             return list(ctranslate2.get_supported_compute_types("cpu"))
-    @staticmethod
-    def generate_and_write_file(file_name: str,
-                                transcribed_segments: list,
-                                add_timestamp: bool,
-                                file_format: str,
-                                output_dir: str
-                                ) -> str:
-        """
-        Writes subtitle file
-        Parameters
-        ----------
-        file_name: str
-            Output file name
-        transcribed_segments: list
-            Text segments transcribed from audio
-        add_timestamp: bool
-            Determines whether to add a timestamp to the end of the filename.
-        file_format: str
-            File format to write. Supported formats: [SRT, WebVTT, txt]
-        output_dir: str
-            Directory path of the output
-        Returns
-        ----------
-        content: str
-            Result of the transcription
-        output_path: str
-            output file path
-        """
-        if add_timestamp:
-            timestamp = datetime.now().strftime("%m%d%H%M%S")
-            output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
-        else:
-            output_path = os.path.join(output_dir, f"{file_name}")
-        file_format = file_format.strip().lower()
-        if file_format == "srt":
-            content = get_srt(transcribed_segments)
-            output_path += '.srt'
-        elif file_format == "webvtt":
-            content = get_vtt(transcribed_segments)
-            output_path += '.vtt'
-        elif file_format == "txt":
-            content = get_txt(transcribed_segments)
-            output_path += '.txt'
-        write_file(content, output_path)
-        return content, output_path
     @staticmethod
     def format_time(elapsed_time: float) -> str:
         """

 import os
 import whisper
 import ctranslate2
 import gradio as gr
 import numpy as np
 from datetime import datetime
 from faster_whisper.vad import VadOptions
 from modules.uvr.music_separator import MusicSeparator
 from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
                                  UVR_MODELS_DIR)
 from modules.utils.constants import *
+from modules.utils.subtitle_manager import *
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
+from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml, read_file
 from modules.whisper.data_classes import *
 from modules.diarize.diarizer import Diarizer
 from modules.vad.silero_vad import SileroVAD
             progress: gr.Progress = gr.Progress(),
             add_timestamp: bool = True,
             *pipeline_params,
+            ) -> Tuple[List[Segment], float]:
         """
         Run transcription with conditional pre-processing and post-processing.
         The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
         add_timestamp: bool
             Whether to add a timestamp at the end of the filename.
         *pipeline_params: tuple
+            Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class.
+            This must be provided as a List with * wildcard because of the integration with gradio.
+            See more info at : https://github.com/gradio-app/gradio/issues/2471
         Returns
         ----------
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for running
         """
                         file_format: str = "SRT",
                         add_timestamp: bool = True,
                         progress=gr.Progress(),
+                        *pipeline_params,
                         ) -> list:
         """
         Write subtitle file from Files
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *pipeline_params: tuple
             Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class
         Returns
             Output file path to return to gr.Files()
         """
         try:
+            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
+            writer_options = {
+                "highlight_words": True if params.whisper.word_timestamps else False
+            }
             if input_folder_path:
                 files = get_media_files(input_folder_path)
             if isinstance(files, str):
                     file,
                     progress,
                     add_timestamp,
+                    *pipeline_params,
                 )
                 file_name, file_ext = os.path.splitext(os.path.basename(file))
+                subtitle, file_path = generate_file(
+                    output_dir=self.output_dir,
+                    output_file_name=file_name,
+                    output_format=file_format,
+                    result=transcribed_segments,
                     add_timestamp=add_timestamp,
+                    **writer_options
                 )
+                files_info[file_name] = {"subtitle": read_file(file_path), "time_for_task": time_for_task, "path": file_path}
             total_result = ''
             total_time = 0
         except Exception as e:
             print(f"Error transcribing file: {e}")
+            raise
         finally:
             self.release_cuda_memory()
                        file_format: str = "SRT",
                        add_timestamp: bool = True,
                        progress=gr.Progress(),
+                       *pipeline_params,
                        ) -> list:
         """
         Write subtitle file from microphone
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *pipeline_params: tuple
             Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class
         Returns
             Output file path to return to gr.Files()
         """
         try:
+            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
+            writer_options = {
+                "highlight_words": True if params.whisper.word_timestamps else False
+            }
             progress(0, desc="Loading Audio..")
             transcribed_segments, time_for_task = self.run(
                 mic_audio,
                 progress,
                 add_timestamp,
+                *pipeline_params,
             )
             progress(1, desc="Completed!")
+            file_name = "Mic"
+            subtitle, file_path = generate_file(
+                output_dir=self.output_dir,
+                output_file_name=file_name,
+                output_format=file_format,
+                result=transcribed_segments,
                 add_timestamp=add_timestamp,
+                **writer_options
             )
             result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
+            return [result_str, file_path]
         except Exception as e:
+            print(f"Error transcribing mic: {e}")
+            raise
         finally:
             self.release_cuda_memory()
                            file_format: str = "SRT",
                            add_timestamp: bool = True,
                            progress=gr.Progress(),
+                           *pipeline_params,
                            ) -> list:
         """
         Write subtitle file from Youtube
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *pipeline_params: tuple
             Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class
         Returns
             Output file path to return to gr.Files()
         """
         try:
+            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
+            writer_options = {
+                "highlight_words": True if params.whisper.word_timestamps else False
+            }
             progress(0, desc="Loading Audio from Youtube..")
             yt = get_ytdata(youtube_link)
             audio = get_ytaudio(yt)
                 audio,
                 progress,
                 add_timestamp,
+                *pipeline_params,
             )
             progress(1, desc="Completed!")
             file_name = safe_filename(yt.title)
+            subtitle, file_path = generate_file(
+                output_dir=self.output_dir,
+                output_file_name=file_name,
+                output_format=file_format,
+                result=transcribed_segments,
                 add_timestamp=add_timestamp,
+                **writer_options
             )
             result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
             if os.path.exists(audio):
                 os.remove(audio)
+            return [result_str, file_path]
         except Exception as e:
+            print(f"Error transcribing youtube: {e}")
+            raise
         finally:
             self.release_cuda_memory()
         else:
             return list(ctranslate2.get_supported_compute_types("cpu"))
     @staticmethod
     def format_time(elapsed_time: float) -> str:
         """

modules/whisper/data_classes.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import gradio as gr
 import torch
-from typing import Optional, Dict, List, Union
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from gradio_i18n import Translate, gettext as _
 from enum import Enum
 from copy import deepcopy
 import yaml
 from modules.utils.constants import *
@@ -17,12 +19,53 @@ class WhisperImpl(Enum):
 class Segment(BaseModel):
-    text: Optional[str] = Field(default=None,
-                                description="Transcription text of the segment")
-    start: Optional[float] = Field(default=None,
-                                   description="Start time of the segment")
-    end: Optional[float] = Field(default=None,
-                                 description="End time of the segment")
 class BaseParams(BaseModel):
@@ -250,9 +293,9 @@ class WhisperParams(BaseParams):
         default=True,
         description="Suppress blank outputs at start of sampling"
     )
-    suppress_tokens: Optional[Union[List, str]] = Field(default=[-1], description="Token IDs to suppress")
     max_initial_timestamp: float = Field(
-        default=0.0,
         ge=0.0,
         description="Maximum initial timestamp"
     )

+import faster_whisper.transcribe
 import gradio as gr
 import torch
+from typing import Optional, Dict, List, Union, NamedTuple
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from gradio_i18n import Translate, gettext as _
 from enum import Enum
 from copy import deepcopy
 import yaml
 from modules.utils.constants import *
 class Segment(BaseModel):
+    id: Optional[int] = Field(default=None, description="Incremental id for the segment")
+    seek: Optional[int] = Field(default=None, description="Seek of the segment from chunked audio")
+    text: Optional[str] = Field(default=None, description="Transcription text of the segment")
+    start: Optional[float] = Field(default=None, description="Start time of the segment")
+    end: Optional[float] = Field(default=None, description="End time of the segment")
+    tokens: Optional[List[int]] = Field(default=None, description="List of token IDs")
+    temperature: Optional[float] = Field(default=None, description="Temperature used during the decoding process")
+    avg_logprob: Optional[float] = Field(default=None, description="Average log probability of the tokens")
+    compression_ratio: Optional[float] = Field(default=None, description="Compression ratio of the segment")
+    no_speech_prob: Optional[float] = Field(default=None, description="Probability that it's not speech")
+    words: Optional[List['Word']] = Field(default=None, description="List of words contained in the segment")
+    @classmethod
+    def from_faster_whisper(cls,
+                            seg: faster_whisper.transcribe.Segment):
+        if seg.words is not None:
+            words = [
+                Word(
+                    start=w.start,
+                    end=w.end,
+                    word=w.word,
+                    probability=w.probability
+                ) for w in seg.words
+            ]
+        else:
+            words = None
+        return cls(
+            id=seg.id,
+            seek=seg.seek,
+            text=seg.text,
+            start=seg.start,
+            end=seg.end,
+            tokens=seg.tokens,
+            temperature=seg.temperature,
+            avg_logprob=seg.avg_logprob,
+            compression_ratio=seg.compression_ratio,
+            no_speech_prob=seg.no_speech_prob,
+            words=words
+        )
+class Word(BaseModel):
+    start: Optional[float] = Field(default=None, description="Start time of the word")
+    end: Optional[float] = Field(default=None, description="Start time of the word")
+    word: Optional[str] = Field(default=None, description="Word text")
+    probability: Optional[float] = Field(default=None, description="Probability of the word")
 class BaseParams(BaseModel):
         default=True,
         description="Suppress blank outputs at start of sampling"
     )
+    suppress_tokens: Optional[Union[List[int], str]] = Field(default=[-1], description="Token IDs to suppress")
     max_initial_timestamp: float = Field(
+        default=1.0,
         ge=0.0,
         description="Maximum initial timestamp"
     )

modules/whisper/faster_whisper_inference.py CHANGED Viewed

@@ -102,11 +102,7 @@ class FasterWhisperInference(BaseTranscriptionPipeline):
         segments_result = []
         for segment in segments:
             progress(segment.start / info.duration, desc="Transcribing..")
-            segments_result.append(Segment(
-                start=segment.start,
-                end=segment.end,
-                text=segment.text
-            ))
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time

         segments_result = []
         for segment in segments:
             progress(segment.start / info.duration, desc="Transcribing..")
+            segments_result.append(Segment.from_faster_whisper(segment))
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time