File size: 4,299 Bytes
19befe8
5c66990
fdad218
47d9326
9de2290
 
 
47d9326
7a97be1
47d9326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdad218
47d9326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdad218
47d9326
7a97be1
 
47d9326
 
 
 
fdad218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47d9326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdad218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47d9326
 
 
fdad218
 
47d9326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
# NOTE(review): this installs Whisper from GitHub each time the module is
# imported, and the exit status returned by os.system() is discarded, so a
# failed install only shows up as an ImportError at `import whisper` below.
# Consider declaring the dependency in the project's requirements instead.
os.system("pip install git+https://github.com/openai/whisper.git")
from pytube import YouTube
import gradio as gr
from subprocess import call
import whisper
import logging
# from transformers.pipelines.audio_utils import ffmpeg_read


# Application logger. Records at INFO and above are written through a
# dedicated StreamHandler using a semicolon-separated
# "timestamp;level;message" layout.
formatter = logging.Formatter(
    fmt="%(asctime)s;%(levelname)s;%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

logger = logging.getLogger("whisper-jax-app")
logger.addHandler(ch)
logger.setLevel(logging.INFO)


# Upper bound, in megabytes, on the size of an audio file that transcribe()
# will accept.
FILE_LIMIT_MB = 1_000

# NOTE(review): the settings below are not referenced anywhere else in the
# visible part of this file; they look like leftovers from a chunked/batched
# pipeline. Confirm nothing imports them before removing.
BATCH_SIZE = 16
CHUNK_LENGTH_S = 30
NUM_PROC = 8
YT_ATTEMPT_LIMIT = 3


def run_cmd(command):
    """Run an external command and wait for it to finish.

    The command is echoed to stdout before it is executed.

    Args:
        command: The command to run, in any form accepted by
            ``subprocess.call`` (callers in this file pass an argument list).

    Raises:
        SystemExit: With status 1 if the wait is interrupted by Ctrl-C.
    """
    # Imported locally because the file never imports ``sys`` at module level;
    # without it the interrupt handler below failed with NameError instead of
    # exiting cleanly.
    import sys

    try:
        print(command)
        call(command)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)


def inference(text):
    """Invoke the ``tts`` command-line tool on *text*.

    Args:
        text: The text passed to ``tts`` through its ``--text`` option.

    Returns:
        The fixed path ``'tts_output.wav'`` (presumably where the ``tts``
        tool writes its result; this function does not check that the file
        exists).
    """
    run_cmd(['tts', '--text', text])
    output_path = 'tts_output.wav'
    return output_path


# Load the "base" Whisper checkpoint once at import time; transcribe() and
# youtube_transcript() both use this module-level instance.
model = whisper.load_model("base")

# Widgets and page text shared by the interfaces defined further down.
inputs = gr.components.Audio(type="filepath", label="Add audio file")
outputs = gr.components.Textbox()
title = "Transcribe multi-lingual audio clips"
# The previous text described a text-to-speech demo, but every tab in this app
# turns speech into text, so the description now says what the app does.
description = (
    "Transcribe speech from YouTube videos, uploaded audio files, or "
    "microphone recordings with OpenAI Whisper."
)
article = ""
# Only referenced by the commented-out legacy interface at the end of the file.
examples = [
    [""]
]


def transcribe(inputs):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        inputs: Path of the audio file to transcribe, or ``None`` when
            nothing was submitted.

    Returns:
        The transcribed text. When no file was submitted, or the file is
        larger than ``FILE_LIMIT_MB``, a human-readable message describing
        the problem is returned instead.
    """
    print('Inputs: ', inputs)

    # Guard: nothing was uploaded or recorded.
    if inputs is None:
        logger.warning("No audio file")
        return "No audio file submitted! Please upload an audio file before submitting your request."

    # Guard: reject files above the configured size limit.
    bytes_per_mb = 1024 * 1024
    file_size_mb = os.stat(inputs).st_size / bytes_per_mb
    if file_size_mb > FILE_LIMIT_MB:
        logger.warning("Max file size exceeded")
        return f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."

    # The file path is handed straight to Whisper.
    # NOTE(review): the language is pinned to English here although the page
    # title advertises multi-lingual clips and youtube_transcript() does not
    # pin a language -- confirm this is intended.
    transcription = model.transcribe(
        audio=inputs,
        language='english',
        word_timestamps=False,
        verbose=True,
    )["text"]

    print(transcription)
    return transcription


# Transcribe youtube video
# define function for transcription
def youtube_transcript(url):
    """Download a YouTube video and return its Whisper transcription.

    Args:
        url: Address of the YouTube video to transcribe.

    Returns:
        The transcribed text on success. Otherwise a human-readable message:
        one for a missing/blank URL, one when no matching stream exists, or
        ``'Error: <details>'`` when the download or transcription raises.
    """
    # Guard first: previously a blank submission fell through the ``if url``
    # check and the function returned None, leaving the output box empty.
    if not url or not url.strip():
        return "No YouTube URL submitted! Please enter a YouTube URL before submitting your request."

    # Broad catch is deliberate: this is the UI boundary, and any failure in
    # pytube or Whisper should be shown to the user instead of crashing.
    try:
        yt = YouTube(url.strip(), use_oauth=True)
        # Pick the highest-resolution progressive MP4 stream.
        stream = (
            yt.streams.filter(progressive=True, file_extension='mp4')
            .order_by('resolution')
            .desc()
            .first()
        )
        # first() yields None when the query matched nothing; report that
        # explicitly instead of failing on ``None.download``.
        if stream is None:
            return 'Error: no downloadable MP4 stream was found for this video.'

        source = stream.download('output/youtube')
        transcript = model.transcribe(source)
        return transcript["text"]
    except Exception as e:
        print('Error: ', e)
        return 'Error: ' + str(e)


# "Audio File" tab: passes the uploaded file to transcribe() using the shared
# module-level input/output widgets and page text.
audio_chunked = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
    allow_flagging="never",
)

# "Microphone" tab: records audio in the browser and passes the recording's
# file path to transcribe(). The widgets come from gr.components, matching the
# shared upload widgets defined earlier, instead of the deprecated
# gr.inputs / gr.outputs shims (whose `optional` argument is itself
# deprecated and is therefore dropped).
microphone_chunked = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.components.Audio(source="microphone", type="filepath"),
    ],
    outputs=[
        gr.components.Textbox(label="Transcription").style(
            show_copy_button=True),
    ],
    allow_flagging="never",
    title=title,
    description=description,
    article=article,
)
# "Youtube" tab: takes a video URL and shows the text returned by
# youtube_transcript(). The widgets come from gr.components, matching the
# shared upload widgets defined earlier, instead of the deprecated
# gr.inputs / gr.outputs shims.
youtube_chunked = gr.Interface(
    fn=youtube_transcript,
    inputs=[
        gr.components.Textbox(label="Youtube URL", type="text"),
    ],
    outputs=[
        gr.components.Textbox(label="Transcription").style(
            show_copy_button=True),
    ],
    allow_flagging="never",
    title=title,
    description=description,
    article=article,
    # Sample videos offered as one-click inputs.
    examples=[
        ["https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24"],
        ["https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren"],
        ["https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision"],
    ],
)

# Combine the three interfaces into a single tabbed page.
with gr.Blocks() as demo:
    gr.TabbedInterface(
        interface_list=[youtube_chunked, audio_chunked, microphone_chunked],
        tab_names=["Youtube", "Audio File", "Microphone"],
    )

# Enable request queuing with the configured concurrency and queue size, then
# start the app with the API page hidden.
demo.queue(concurrency_count=1, max_size=5)
demo.launch(show_api=False)


# gr.Interface(
#     inference,
#     inputs,
#     outputs,
#     verbose=True,
#     title=title,
#     description=description,
#     article=article,
#     examples=examples,
#     enable_queue=True,

# ).launch(share=True, debug=True)