File size: 3,970 Bytes
bb506f1
 
 
ec608d5
 
 
 
bb506f1
ec608d5
bb506f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec608d5
 
 
 
 
 
 
 
bb506f1
ec608d5
 
 
 
 
 
 
 
bb506f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec608d5
 
 
 
 
 
 
 
bb506f1
 
 
bebe125
 
 
074ec30
bebe125
ec608d5
bb506f1
ec608d5
bb506f1
bfe995c
bebe125
bb506f1
42a00dc
 
ec608d5
 
bb506f1
ec608d5
 
 
 
 
bebe125
ec608d5
 
bb506f1
 
ec608d5
bb506f1
 
ec608d5
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Imports
"""
from transformers import pipeline
from pytube import YouTube
import gradio as gr
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

"""
Pipeline and models
"""
# Speech recognition: the fine-tuned Whisper checkpoint, loaded through the
# generic pipeline factory (no task is passed explicitly).
# Replace the id with "your-username/the-name-you-picked" to use another model.
transcribe_pipe = pipeline(model="Silemo/whisper-it")

# Tag generation: a seq2seq model and its tokenizer, both taken from the same
# checkpoint.
_TAGS_CHECKPOINT = "efederici/text2tags"
tags_tokenizer = AutoTokenizer.from_pretrained(_TAGS_CHECKPOINT)
tags_model = AutoModelForSeq2SeqLM.from_pretrained(_TAGS_CHECKPOINT)

"""
Methods
"""
def transcribe(audio):
    """
    Transcribes an audio input to text with the speech-recognition pipeline.

    The pipeline is called with chunk_length_s=30 so that recordings longer
    than a single 30-second Whisper window are split into chunks and
    transcribed in full, instead of being cut off or rejected.

    Args:
        audio: input accepted by transcribe_pipe; callers in this file pass
            the path of an audio file.

    Returns:
        The value stored under the "text" key of the pipeline output.
    """
    result = transcribe_pipe(audio, chunk_length_s=30)
    return result["text"]

def transcribe_video(url):
    """
    Transcribes and tags the audio track of a YouTube video.

    The audio is downloaded into a temporary directory that is removed as soon
    as the transcription is done, so repeated calls neither pile up files in
    the working directory nor collide on videos with identical titles.

    Args:
        url: address of the YouTube video.

    Returns:
        The (transcription, tags) pair produced by transcribe_and_tag.

    Raises:
        ValueError: if url is empty or the video exposes no audio-only stream.
    """
    import tempfile

    if not url or not url.strip():
        raise ValueError("A YouTube URL is required")

    stream = YouTube(url.strip()).streams.get_audio_only()
    if stream is None:
        # get_audio_only() yields None when no matching stream exists
        raise ValueError(f"No audio-only stream available for {url}")

    with tempfile.TemporaryDirectory() as workdir:
        # download() returns the path of the file it wrote
        audio_path = stream.download(output_path=workdir)
        # Transcribe while the file still exists; the directory is deleted
        # when the with-block exits.
        return transcribe_and_tag(audio_path)

def transcribe_and_tag(audio):
    """
    Transcribes the audio and generates tags from the transcription.

    Returns the pair (transcription, tags).
    """
    transcription = transcribe(audio)
    return transcription, tag(text=transcription)

def download_audio(audio_url, filename, timeout=30):
    """
    Downloads the file at audio_url and stores it at filename.

    Args:
        audio_url: URL of the audio file to fetch.
        filename: path of the local file to write; it is opened in binary
            write mode, so an existing file is overwritten.
        timeout: seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the caller forever.

    Raises:
        requests.HTTPError: if the server answers with an error status.
            Nothing is written in that case, so an error page is never saved
            as if it were audio.
        requests.RequestException: on connection failures or timeouts.
    """
    response = requests.get(audio_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving the error body to disk
    response.raise_for_status()

    # Write the raw response body unchanged
    with open(filename, "wb") as audio_file:
        audio_file.write(response.content)

def tag(text: str):
    """
    Produces a list of tags for the given text.

    The text is stripped, its newlines are removed, and it is prefixed with
    'summarize: ' before being encoded and handed to the tags model. The first
    generated sequence is decoded without special tokens and split on ', '.
    """
    prompt = 'summarize: ' + text.strip().replace('\n', '')
    input_ids = tags_tokenizer.encode(prompt, return_tensors="pt")

    # Beam-search settings for the generation call
    generation_options = {
        "num_beams": 4,
        "no_repeat_ngram_size": 2,
        "max_length": 20,
        "early_stopping": True,
    }
    generated = tags_model.generate(input_ids, **generation_options)

    decoded = tags_tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.split(', ')

"""
Downloading audio files
"""
# Remote location and local file name of each example clip
audio1_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/offer.mp3"
audio1_filename = "offer.mp3"

audio2_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/fantozzi.mp3"
audio2_filename = "fantozzi.mp3"

# Fetch both clips, in order, so they are on disk before the interface
# examples refer to them.
for _url, _filename in (
    (audio1_url, audio1_filename),
    (audio2_url, audio2_filename),
):
    download_audio(_url, _filename)

"""
Interfaces
"""
# Output widgets: each interface gets its own transcription and tags boxes
audio_transcription = gr.Textbox(label="Transcription")
audio_tags = gr.Textbox(label="Tags")
yt_transcription = gr.Textbox(label="Transcription")
yt_tags = gr.Textbox(label="Tags")

# Multiple interfaces using tabs -> https://github.com/gradio-app/gradio/issues/450

# Interface 1: uploaded or recorded audio, passed to the callback as a file path
io1 = gr.Interface(
    title="Whisper Small - Italian - Microphone or Audio file",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses the computer microphone as audio input. It outputs a transcription and the tags of the text",
    fn=transcribe_and_tag,
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
    outputs=[audio_transcription, audio_tags],
    examples=[[audio1_filename], [audio2_filename]],
)

# Interface 2: a YouTube link whose audio is transcribed and tagged
io2 = gr.Interface(
    title="Whisper Small - Italian - YouTube link",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses a YouTube link as audio input",
    fn=transcribe_video,
    inputs=gr.Textbox(label="YouTube URL", placeholder="https://youtu.be/9DImRZERJNs?si=1Lme7o_KH2oCxU7y"),
    outputs=[yt_transcription, yt_tags],
    examples=[
        ["https://www.youtube.com/watch?v=qMslwA7RCcc"],  # Meloni - Confindustria
        ["https://www.youtube.com/watch?v=WlT3dCAGjRo"],  # Montemagno - Ripartire da zero
    ],
)

# Tab titles are given as a list, in the same order as the interfaces.
# A set literal has no guaranteed iteration order, so with a set the two
# titles could end up attached to the wrong tabs.
gr.TabbedInterface(
    [io1, io2],
    ["Microphone or audio file", "YouTube"],
).launch()