File size: 2,718 Bytes
ec608d5
 
 
 
 
 
 
 
 
 
 
 
 
 
42a00dc
ec608d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a00dc
ec608d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f83c531
ec608d5
 
 
42a00dc
 
ec608d5
 
 
42a00dc
ec608d5
 
 
 
 
 
 
 
 
 
 
 
42a00dc
 
 
 
ec608d5
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from transformers import pipeline
from pytube import YouTube
import gradio as gr
import requests

# Build the Hugging Face pipeline once at import time so every request reuses
# the same loaded model. To serve a different fine-tuned checkpoint, replace
# the model id below with "your-username/your-model-name".
pipe = pipeline(model="Silemo/whisper-it")

def download_audio(audio_url, filename):
    """Download the file at ``audio_url`` and save it locally as ``filename``.

    Args:
        audio_url: HTTP(S) URL of the audio file to fetch.
        filename: Local path the response body is written to, in binary mode.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # A timeout keeps app start-up from hanging forever on a stalled server.
    response = requests.get(audio_url, timeout=60)

    # Fail loudly instead of silently saving an HTML error page as an audio
    # file that would later break the example clips.
    response.raise_for_status()

    # Write the raw response bytes unchanged.
    with open(filename, "wb") as f:
        f.write(response.content)

def transcribe(audio):
    """Run the speech-recognition pipeline on ``audio`` and return its text.

    Args:
        audio: Input forwarded unchanged to ``pipe`` (the Gradio audio widget
            and ``transcribe_video`` both supply a file path).

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.
    """
    result = pipe(audio)
    return result["text"]

def transcribe_video(url):
    """Transcribe the audio track of the YouTube video at ``url``.

    The audio-only stream is downloaded into a temporary directory that is
    removed as soon as transcription finishes, so repeated requests do not
    accumulate media files in the working directory.

    Args:
        url: Link to the YouTube video.

    Returns:
        The transcription text produced by ``transcribe``.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    import tempfile  # local import: only this function needs it

    yt = YouTube(url)
    stream = yt.streams.get_audio_only()
    if stream is None:
        # Without this guard the failure shows up as an opaque AttributeError.
        raise ValueError(f"No audio-only stream available for {url!r}")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # download() returns the path of the file it wrote inside tmp_dir.
        audio_path = stream.download(output_path=tmp_dir)

        # Transcribe while the file still exists; the directory and its
        # contents are deleted when the with-block exits.
        return transcribe(audio_path)

# Example clips offered in the "Microphone or audio file" tab. Each one is
# fetched at start-up and stored under its local filename.
audio1_filename = "offer.mp3"
audio1_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/offer.mp3"

audio2_filename = "fantozzi.mp3"
audio2_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/fantozzi.mp3"

for _example_url, _example_filename in (
    (audio1_url, audio1_filename),
    (audio2_url, audio2_filename),
):
    download_audio(_example_url, _example_filename)

# Multiple interfaces using tabs -> https://github.com/gradio-app/gradio/issues/450

# Interface 1: transcribe speech recorded from the microphone or uploaded as
# an audio file.
io1 = gr.Interface(
    title="Whisper Small - Italian - Microphone or Audio file",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses the computer microphone or an audio file as audio input",
    fn=transcribe,
    # type="filepath" hands the callback a path on disk rather than raw samples.
    inputs=gr.Audio(type="filepath", sources=["microphone", "upload"]),
    outputs="text",
    # One row per example; each row holds the single audio input.
    examples=[[audio1_filename], [audio2_filename]],
)

# Interface 2: transcribe the audio track of a YouTube video given its link.
io2 = gr.Interface(
    title="Whisper Small - Italian - YouTube link",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses a YouTube link as audio input",
    fn=transcribe_video,
    inputs=gr.Textbox(
        placeholder="https://youtu.be/9DImRZERJNs?si=1Lme7o_KH2oCxU7y",
        label="YouTube URL",
    ),
    outputs="text",
    # One row per example link.
    examples=[
        # Video: "Per me è la cipolla"
        ["https://youtu.be/QbwZlURClSA?si=DKMtIiKE-nO2mfcV"],
        # Video: Breaking Italy - "Lollobrigida ferma il treno"
        ["https://youtu.be/9MPBN0tnA_E?si=8-hqkJS05LNkWprX&t=2"],
        # Video: Mussolini speech
        ["https://youtu.be/UmnxcjRk37Q?si=uxt8oqnMDJ3vFzIB&t=77"],
    ],
)

# Combine both interfaces into one tabbed app and start the server.
# The tab names must be an ordered sequence matching the interface list:
# a set literal has no guaranteed iteration order, so labels could end up
# attached to the wrong tab.
gr.TabbedInterface(
    [io1, io2],
    ["Microphone or audio file", "YouTube"],
).launch()