File size: 4,261 Bytes
69dba80
07ebfb1
 
 
e1a5899
07ebfb1
5b11f8b
5aa1892
07ebfb1
311ebef
 
 
 
 
 
99d9b3e
07ebfb1
446a864
 
07ebfb1
 
 
6351056
ad6cbd0
07ebfb1
1c4706b
769dbd6
84d6345
983c638
1c4706b
983c638
 
 
35af703
311ebef
983c638
602514f
 
 
 
 
35af703
07ebfb1
6a3ae5e
 
 
 
 
 
 
 
 
 
 
 
c1541fb
07ebfb1
 
 
 
 
 
 
 
 
e1a5899
f4d4476
99d9b3e
 
 
 
 
 
 
 
 
 
 
 
 
b9fdb45
e1a5899
07ebfb1
f4d4476
07ebfb1
 
 
 
 
 
e1a5899
 
07ebfb1
 
e1a5899
07ebfb1
 
 
 
 
 
 
 
99d9b3e
07ebfb1
6a3ae5e
e1a5899
 
6a3ae5e
e1a5899
07ebfb1
 
 
 
e1a5899
07ebfb1
 
 
 
 
 
99d9b3e
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import time
import os

import torch

import gradio as gr
import spaces
from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
from huggingface_hub import model_info
try:
    import flash_attn
    FLASH_ATTENTION = True
except ImportError:
    FLASH_ATTENTION = False

import yt_dlp  # Added import for yt-dlp

# Hugging Face Hub checkpoint used for transcription, and the language code
# handed to the Whisper decoder prompt.
MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"

# SHARE is truthy when its first non-blank character is t/y/1 (case-insensitive),
# e.g. "true", "Yes", "1". Slicing with [:1] (instead of indexing [0]) keeps an
# empty SHARE value from raising IndexError, and tuple membership is used because
# an empty string is a substring of every string. A falsy result becomes None.
share = (os.environ.get("SHARE", "False").strip()[:1].lower() in ("t", "y", "1")) or None
# Explicit token from the environment if set; otherwise True is passed through.
auth_token = os.environ.get("AUTH_TOKEN") or True
# First CUDA device when available, CPU otherwise.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

@spaces.GPU(duration=60 * 2)
def pipe(file, return_timestamps=False):
    """Run the NB-Whisper speech-recognition pipeline on one audio file.

    A new ``transformers`` ASR pipeline is constructed on every call, the
    decoder prompt is forced to ``lang`` / "transcribe", and the file is
    processed in 30-second chunks with a batch size of 24.

    Args:
        file: Audio input forwarded unchanged to the pipeline (callers in this
            file pass a file path).
        return_timestamps: When true, timestamps are requested from the
            pipeline and timestamp tokens are enabled in the decoder prompt.

    Returns:
        The raw pipeline result (callers in this file read its ``"text"`` and
        ``"chunks"`` entries).
    """
    on_gpu = device.type == "cuda"
    # Half precision and FlashAttention-2 are only requested when running on
    # CUDA; on CPU fall back to float32 with the SDPA attention implementation
    # so the model can still load and run.
    attn_implementation = "flash_attention_2" if (FLASH_ATTENTION and on_gpu) else "sdpa"
    asr = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=30,
        device=device,
        token=auth_token,
        torch_dtype=torch.float16 if on_gpu else torch.float32,
        model_kwargs={"attn_implementation": attn_implementation},
    )
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    return asr(file, return_timestamps=return_timestamps, batch_size=24)

def transcribe(file, return_timestamps=False):
    """Transcribe an audio file to text via ``pipe``.

    Args:
        file: Audio input forwarded to ``pipe``.
        return_timestamps: When false, return the plain transcript. When true,
            return one line per chunk formatted as
            ``[HH:MM:SS -> HH:MM:SS] text``.

    Returns:
        The transcript as a single string.
    """
    if not return_timestamps:
        return pipe(file)["text"]

    def _clock(seconds):
        # A chunk boundary may be None; render it as a placeholder.
        if seconds is None:
            return "??:??:??"
        return time.strftime('%H:%M:%S', time.gmtime(seconds))

    segments = pipe(file, return_timestamps=True)["chunks"]
    return "\n".join(
        f"[{_clock(segment['timestamp'][0])} -> {_clock(segment['timestamp'][1])}] {segment['text']}"
        for segment in segments
    )

def _return_yt_html_embed(yt_url):
    """Build the HTML snippet that embeds a YouTube video in an iframe.

    The video id is taken from the ``v`` query parameter when present
    (regardless of its position or of extra parameters such as ``t``),
    otherwise from the last path segment, which covers ``youtu.be/<id>``,
    ``/embed/<id>``, ``/shorts/<id>`` and a bare id. If neither yields a
    value, the text after the last ``?v=`` is used.

    Args:
        yt_url: URL (or bare video id) as entered by the user.

    Returns:
        An HTML string containing a centered 500x320 iframe.
    """
    import html
    from urllib.parse import parse_qs, quote, urlparse

    cleaned = yt_url.strip()
    parsed = urlparse(cleaned)
    query_ids = parse_qs(parsed.query).get("v")
    path_segments = [segment for segment in parsed.path.split("/") if segment]
    if query_ids:
        video_id = query_ids[0]
    elif path_segments:
        video_id = path_segments[-1]
    else:
        video_id = cleaned.split("?v=")[-1]

    # The value comes straight from user input and lands inside an HTML
    # attribute: percent-encode it for the URL and escape it for the markup.
    safe_id = html.escape(quote(video_id, safe=""), quote=True)
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{safe_id}"> </iframe>'
        " </center>"
    )
    return HTML_str

def yt_transcribe(yt_url, return_timestamps=False):
    """Download the audio of a YouTube video and transcribe it.

    The audio is fetched with yt-dlp and converted to MP3 by the
    ``FFmpegExtractAudio`` post-processor. Each call works in its own
    temporary directory so simultaneous requests cannot overwrite each
    other's download, and the file is removed once transcription finishes.

    Args:
        yt_url: URL of the video to transcribe.
        return_timestamps: Forwarded to ``transcribe``.

    Returns:
        A ``(html_embed, text)`` tuple: the iframe snippet for the video and
        the transcript.
    """
    import tempfile

    html_embed_str = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as workdir:
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(workdir, 'audio.%(ext)s'),
            # A URL that also names a playlist should fetch only the video.
            'noplaylist': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'quiet': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([yt_url])

        # Transcribe inside the `with` block: the directory (and the MP3 in
        # it) is deleted as soon as the block exits.
        text = transcribe(os.path.join(workdir, "audio.mp3"), return_timestamps=return_timestamps)

    return html_embed_str, text

# Root container for the app; the tabbed interface is attached to it below.
demo = gr.Blocks()

# Tab for transcribing an uploaded or microphone-recorded audio file.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
        gr.components.Checkbox(label="Return timestamps"),
    ],
    outputs="text",
    title="NB-Whisper",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Tab for transcribing the audio track of a YouTube video given its URL;
# shows the embedded video alongside the transcript.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.components.Checkbox(label="Return timestamps"),
    ],
    examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
    outputs=["html", "text"],
    title="Whisper Demo: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the fine-tuned checkpoint:"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

# Combine both interfaces into a single two-tab app.
with demo:
    gr.TabbedInterface(
        [mf_transcribe, yt_transcribe_interface],
        ["Transcribe Audio", "Transcribe YouTube"]
    )

# queue() configures the Blocks app and returns it, so it has to be chained
# before launch(); launch() starts the server and does not return the app.
demo.queue().launch(share=share)