Spaces:
Runtime error
Runtime error
File size: 3,851 Bytes
bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 bfe995c bb506f1 42a00dc ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 bb506f1 ec608d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
"""
Imports
"""
from transformers import pipeline
from pytube import YouTube
import gradio as gr
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
"""
Pipeline and models
"""
# Speech-to-text pipeline; swap the model id to use a different checkpoint
transcribe_pipe = pipeline(model="Silemo/whisper-it")

# The tag generator's tokenizer and seq2seq model are loaded from one checkpoint
_TAGS_CHECKPOINT = "efederici/text2tags"
tags_tokenizer = AutoTokenizer.from_pretrained(_TAGS_CHECKPOINT)
tags_model = AutoModelForSeq2SeqLM.from_pretrained(_TAGS_CHECKPOINT)
"""
Methods
"""
def transcribe(audio):
    """
    Runs the speech-recognition pipeline on the audio and returns its text

    The argument is passed to transcribe_pipe unchanged; only the "text"
    entry of the pipeline result is returned.
    """
    result = transcribe_pipe(audio)
    return result["text"]
def transcribe_video(url):
    """
    Downloads the audio track of a YouTube video, then transcribes and tags it

    Parameters:
        url: address of the YouTube video

    Returns:
        the (transcription, tags) pair produced by transcribe_and_tag

    Raises:
        ValueError: if no audio-only stream is available for the video
    """
    import tempfile  # local import: only this function needs it

    stream = YouTube(url).streams.get_audio_only()
    if stream is None:
        # Without this guard the failure would surface as an opaque
        # AttributeError on the download call below
        raise ValueError(f"No audio-only stream available for {url!r}")

    # Download into a throw-away directory so the audio file is removed once
    # it has been transcribed, instead of piling up in the working directory
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = stream.download(output_path=workdir)
        return transcribe_and_tag(audio_path)
def transcribe_and_tag(audio):
    """
    Transcribes the audio and generates tags from the transcription

    Returns the pair (transcribed text, tags).
    """
    spoken_text = transcribe(audio)
    return spoken_text, tag(text=spoken_text)
def download_audio(audio_url, filename, timeout=30):
    """
    Downloads the audio file at audio_url and saves it as filename

    Parameters:
        audio_url: URL of the audio file to download
        filename: path of the local file to write (opened in binary mode)
        timeout: seconds to wait for the server before giving up

    Raises:
        requests.HTTPError: if the server answers with an error status
        requests.Timeout: if the server does not answer within timeout
    """
    # A timeout keeps an unresponsive server from blocking the caller forever
    response = requests.get(audio_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as if it were audio
    response.raise_for_status()
    # Write the raw response body to disk unchanged
    with open(filename, 'wb') as f:
        f.write(response.content)
def tag(text: str):
    """
    Generates tags from given text

    Parameters:
        text: the text to generate tags for

    Returns:
        list of tag strings, obtained by splitting the model output on ', ';
        an empty list when text is blank
    """
    # Collapse every run of whitespace (newlines included) into one space so
    # that words on consecutive lines are not glued together
    text = ' '.join(text.split())
    if not text:
        # Nothing to tag: skip the model rather than generate tags from the
        # bare prompt
        return []
    prompt = 'summarize: ' + text
    tokenized_text = tags_tokenizer.encode(prompt, return_tensors="pt")
    tags_ids = tags_model.generate(tokenized_text,
                                   num_beams=4,
                                   no_repeat_ngram_size=2,
                                   max_length=20,
                                   early_stopping=True)
    # Only the first generated sequence is decoded
    output = tags_tokenizer.decode(tags_ids[0], skip_special_tokens=True)
    return output.split(', ')
"""
Downloading audio files
"""
# Local file names of the example clips (also used by the interface examples)
audio1_filename = "offer.mp3"
audio2_filename = "fantozzi.mp3"

# Remote locations the example clips are fetched from
audio1_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/offer.mp3"
audio2_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/fantozzi.mp3"

# Fetch both clips at start-up, in the same order as before
for _clip_url, _clip_name in (
    (audio1_url, audio1_filename),
    (audio2_url, audio2_filename),
):
    download_audio(_clip_url, _clip_name)
"""
Interfaces
"""
# Output components of the first tab
transcription = gr.Textbox(label="Transcription")
tags = gr.Textbox(label="Tags")

# Multiple interfaces using tabs -> https://github.com/gradio-app/gradio/issues/450

# Tab 1: transcribe and tag an uploaded or recorded audio clip
io1 = gr.Interface(
    fn=transcribe_and_tag,
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
    outputs=[transcription, tags],
    examples=[
        [audio1_filename],
        [audio2_filename],
    ],
    title="Whisper Small - Italian - Microphone or Audio file",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses the computer microphone as audio input. It outputs a transcription and the tags of the text",
)

# Tab 2: transcribe and tag the audio track of a YouTube video
io2 = gr.Interface(
    fn=transcribe_video,
    inputs=gr.Textbox(label="YouTube URL", placeholder="https://youtu.be/9DImRZERJNs?si=1Lme7o_KH2oCxU7y"),
    # This tab gets its own output boxes: reusing one component instance in
    # two interfaces risks a duplicate-block error when they are combined
    outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Tags")],
    examples=[
        # Meloni - Confindustria
        ["https://www.youtube.com/watch?v=qMslwA7RCcc"],
        # Montemagno - Ripartire da zero
        ["https://www.youtube.com/watch?v=WlT3dCAGjRo"],
    ],
    title="Whisper Small - Italian - YouTube link",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses a YouTube link as audio input",
)

# The tab names must be an ordered list matching [io1, io2]: a set literal has
# no defined iteration order, so the labels could land on the wrong tabs
gr.TabbedInterface(
    [io1, io2], ["Microphone or audio file", "YouTube"]
).launch()