import logging

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

from whisperplus.utils.download_utils import download_and_convert_to_mp3

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class SpeechToTextPipeline:
    """Class for converting audio to text using a pre-trained speech recognition model."""

    def __init__(self, model_id: str = "openai/whisper-large-v3"):
        self.model = None
        # Pick the GPU when one is available so `model.to(self.device)` below has a real target.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if self.model is None:
            self.load_model(model_id)
        else:
            logging.info("Model already loaded.")

    def load_model(self, model_id: str = "openai/whisper-large-v3"):
        """
        Loads the pre-trained speech recognition model and moves it to the specified device.

        Args:
            model_id (str): Identifier of the pre-trained model to be loaded.
        """
        logging.info("Loading model...")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
        model.to(self.device)
        logging.info("Model loaded successfully.")

        self.model = model

    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
        """
        Converts audio to text using the pre-trained speech recognition model.

        Args:
            audio_path (str): Path to the audio file to be transcribed.
            model_id (str): Identifier of the pre-trained model to be used for transcription.
            language (str): Spoken language of the audio, forwarded to the model's generate step.

        Returns:
            str: Transcribed text from the audio.
        """
        processor = AutoProcessor.from_pretrained(model_id)
        pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            torch_dtype=torch.float16,
            chunk_length_s=30,
            max_new_tokens=128,
            batch_size=24,
            return_timestamps=True,
            device="cuda",
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            model_kwargs={"use_flash_attention_2": True},
            generate_kwargs={"language": language},
        )
        logging.info("Transcribing audio...")
        result = pipe(audio_path)["text"]
        return result
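

# A minimal usage sketch (illustrative only, not part of the app): assumes a local file
# named "sample.mp3" exists and that a CUDA-capable GPU is available.
#
#   stt = SpeechToTextPipeline(model_id="openai/whisper-large-v3")
#   text = stt("sample.mp3", model_id="openai/whisper-large-v3", language="english")
#   print(text)
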

def youtube_url_to_text(url, model_id, language_choice):
    """
    Main function that downloads a video, converts it to MP3, performs speech-to-text conversion using a
    specified model, and returns the transcript along with the path of the extracted audio.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language choice for the speech-to-text conversion.

    Returns:
        transcript (str): The transcript of the speech-to-text conversion.
        audio_path (str): The path of the extracted MP3 audio file.
    """
    audio_path = download_and_convert_to_mp3(url)
    # Use a distinct name so the transformers `pipeline` import is not shadowed.
    stt_pipeline = SpeechToTextPipeline(model_id)
    transcript = stt_pipeline(audio_path=audio_path, model_id=model_id, language=language_choice)

    return transcript, audio_path
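

# Illustrative example (a sketch; the URL below is a placeholder, not a real video):
#
#   transcript, audio_path = youtube_url_to_text(
#       "https://www.youtube.com/watch?v=<video_id>",
#       "openai/whisper-large-v3",
#       "English",
#   )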


def youtube_url_to_text_app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter YouTube URL", label="YouTube URL")

                language_choice = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )
                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                whisperplus_in_predict = gr.Button(value="Generate Transcript")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        whisperplus_in_predict.click(
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
        )


gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    WhisperPlus: Advancing Speech-to-Text Processing 🚀
    </h1>
    """)
    gr.HTML(
        """
        <h3 style='text-align: center'>
        Follow me for more!
        <a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>GitHub</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>LinkedIn</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
        </h3>
        """)
    with gr.Row():
        with gr.Column():
            with gr.Tab(label="YouTube URL to Text"):
                youtube_url_to_text_app()

gradio_app.queue()
gradio_app.launch(debug=True)
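
# The launch call above runs locally with debug logging. If a public link or a specific
# host/port is needed (e.g. on a remote server), Gradio's standard launch options can be
# used instead; a sketch, not part of the original app:
#
#   gradio_app.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)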