Spaces:
Sleeping
Sleeping
File size: 5,915 Bytes
4dbcfe1 f475b61 4dbcfe1 2986f68 a24b265 72014e7 4dbcfe1 d2573d2 4dbcfe1 a24b265 4dbcfe1 2986f68 0eac81e 2986f68 4dbcfe1 d2573d2 4dbcfe1 f475b61 4dbcfe1 72014e7 4dbcfe1 72014e7 d2573d2 4dbcfe1 72014e7 4dbcfe1 f475b61 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import gradio as gr
import moviepy.editor as mp
from deep_translator import GoogleTranslator
from pydub import AudioSegment
import os
import tempfile
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import spaces
import pytube
import librosa
# Supported languages: two-letter code (key) -> three-letter code (value).
# The three-letter values populate the "Target Language" dropdown; the
# two-letter keys are what gets handed to the translator.
LANGUAGES = {
    "en": "eng",
    "zh": "zho",
    "de": "deu",
    "es": "spa",
    "ru": "rus",
    "ko": "kor",
    "fr": "fra",
    "ja": "jpn",
    "pt": "por",
    "tr": "tur",
    "pl": "pol",
    "ca": "cat",
    "nl": "nld",
    "ar": "ara",
    "sv": "swe",
    "it": "ita",
    "id": "ind",
    "hi": "hin",
    "fi": "fin",
    "vi": "vie",
    "iw": "heb",
    "uk": "ukr",
    "el": "ell",
    "ms": "msa",
    "cs": "ces",
    "ro": "ron",
    "da": "dan",
    "hu": "hun",
    "ta": "tam",
    "no": "nor",
    "th": "tha",
    "ur": "urd",
    "hr": "hrv",
    "bg": "bul",
    "lt": "lit",
    "la": "lat",
    "mi": "mri",
    "ml": "mal",
    "cy": "cym",
    "sk": "slk",
    "te": "tel",
    "fa": "fas",
    "lv": "lav",
    "bn": "ben",
    "sr": "srp",
    "az": "aze",
    "sl": "slv",
    "kn": "kan",
    "et": "est",
    "mk": "mkd",
    "br": "bre",
    "eu": "eus",
    "is": "isl",
    "hy": "hye",
    "ne": "nep",
    "mn": "mon",
    "bs": "bos",
    "kk": "kaz",
    "sq": "sqi",
    "sw": "swa",
    "gl": "glg",
    "mr": "mar",
    "pa": "pan",
    "si": "sin",
    "km": "khm",
    "sn": "sna",
    "yo": "yor",
    "so": "som",
    "af": "afr",
    "oc": "oci",
    "ka": "kat",
    "be": "bel",
    "tg": "tgk",
    "sd": "snd",
    "gu": "guj",
    "am": "amh",
    "yi": "yid",
    "lo": "lao",
    "uz": "uzb",
    "fo": "fao",
    "ht": "hat",
    "ps": "pus",
    "tk": "tuk",
    "nn": "nno",
    "mt": "mlt",
    "sa": "san",
    "lb": "ltz",
    "my": "mya",
    "bo": "bod",
    "tl": "tgl",
    "mg": "mlg",
    "as": "asm",
    "tt": "tat",
    "haw": "haw",
    "ln": "lin",
    "ha": "hau",
    "ba": "bak",
    "jw": "jav",
    "su": "sun",
}
def extract_audio(video_path):
    """Write the audio track of a video to a temporary WAV file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.wav`` file. The file is not deleted
        here; the caller is responsible for removing it.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError("The video does not contain an audio track.")

        # Reserve the output name by actually creating the file:
        # tempfile.mktemp() only returns a name, which another process could
        # claim before the audio is written.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            audio_path = handle.name

        try:
            audio.write_audiofile(audio_path)
        except Exception:
            # Do not leave a half-written temp file behind on failure.
            os.remove(audio_path)
            raise
    finally:
        # Release the reader resources held by the clip.
        video.close()
    return audio_path
@spaces.GPU
def generate_subtitles(audio_path):
    """Transcribe an audio file with Whisper into timed subtitle segments.

    The Whisper feature extractor pads or truncates its input to a single
    30-second window, so passing the whole recording at once would only
    transcribe the first 30 seconds. The audio is therefore transcribed one
    window at a time, and each window becomes its own subtitle segment.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A list of dicts with keys ``"start"`` and ``"end"`` (seconds from the
        beginning of the audio) and ``"text"`` (the transcription of that
        window). Windows that produce no text are omitted, so the list is
        empty for empty or silent audio.
    """
    sample_rate = 16000
    window_samples = 30 * sample_rate  # Whisper's fixed input length

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").to(device)

    # Load the audio resampled to the rate the processor is called with.
    audio_input, _ = librosa.load(audio_path, sr=sample_rate)

    segments = []
    for offset in range(0, len(audio_input), window_samples):
        window = audio_input[offset:offset + window_samples]
        input_features = processor(
            window, sampling_rate=sample_rate, return_tensors="pt"
        ).input_features.to(device)

        # Generate token ids, then decode them to text.
        predicted_ids = model.generate(input_features)
        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
        if not text:
            continue

        segments.append({
            "start": offset / sample_rate,
            # The last window may be shorter than 30 seconds.
            "end": (offset + len(window)) / sample_rate,
            "text": text,
        })
    return segments
def translate_subtitles(subtitles, target_language):
    """Translate the text of every subtitle segment.

    Args:
        subtitles: Iterable of dicts with ``"start"``, ``"end"`` and
            ``"text"`` keys.
        target_language: Either a three-letter value from ``LANGUAGES``
            (e.g. ``"fra"``) or a code the translator accepts directly.

    Returns:
        A new list of dicts with the same ``"start"``/``"end"`` values and
        the translated ``"text"``. The input segments are not modified.
    """
    # LANGUAGES maps two-letter codes to three-letter ones. Callers pass the
    # three-letter value, while the translator needs the two-letter key, so
    # do a reverse lookup and fall back to the given value when it is not a
    # known three-letter code.
    target_lang_code = next(
        (code for code, iso3 in LANGUAGES.items() if iso3 == target_language),
        target_language,
    )

    # The Google backend has no plain "zh" target; it distinguishes
    # "zh-CN" (simplified) and "zh-TW" (traditional). Default to simplified.
    google_aliases = {"zh": "zh-CN"}
    target_lang_code = google_aliases.get(target_lang_code, target_lang_code)

    translator = GoogleTranslator(source='auto', target=target_lang_code)
    return [
        {
            "start": segment["start"],
            "end": segment["end"],
            # Keep the original text if the translator yields nothing, so
            # the segment can still be rendered.
            "text": translator.translate(segment["text"]) or segment["text"],
        }
        for segment in subtitles
    ]
def add_subtitles_to_video(video_path, subtitles, output_path):
    """Render subtitles onto a video and write the result to disk.

    Args:
        video_path: Path of the source video.
        subtitles: Iterable of dicts with ``"start"`` and ``"end"`` (seconds)
            and ``"text"`` keys. Entries with blank text or a non-positive
            duration are skipped because they cannot be rendered.
        output_path: Path the subtitled video is written to (H.264 video,
            AAC audio).
    """
    video = mp.VideoFileClip(video_path)
    subtitle_clips = []
    final_video = None
    try:
        for subtitle in subtitles:
            text = subtitle["text"]
            duration = subtitle["end"] - subtitle["start"]
            if not text or not text.strip() or duration <= 0:
                continue
            subtitle_clips.append(
                mp.TextClip(txt=text, fontsize=24, color='white', bg_color='black', font='Arial')
                .set_position(('center', 'bottom'))
                .set_duration(duration)
                .set_start(subtitle["start"])
            )

        # The source video is the base layer; subtitles are drawn on top.
        final_video = mp.CompositeVideoClip([video] + subtitle_clips)
        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
    finally:
        # Release clip resources even when rendering fails.
        for clip in subtitle_clips:
            clip.close()
        if final_video is not None:
            final_video.close()
        video.close()
def process_video(video_path, target_language):
    """Run the full pipeline: transcribe, translate and burn in subtitles.

    Args:
        video_path: Path of the video to process.
        target_language: Language to translate the subtitles into, in the
            form accepted by ``translate_subtitles``.

    Returns:
        Path of a temporary ``.mp4`` file containing the subtitled video.
    """
    # Extract audio from video
    audio_path = extract_audio(video_path)
    try:
        # Generate subtitles using Whisper
        subtitles = generate_subtitles(audio_path)
    finally:
        # The intermediate audio file is only needed for transcription.
        # Removal is best-effort: a cleanup failure must not mask the result
        # or the real error.
        try:
            os.remove(audio_path)
        except OSError:
            pass

    # Translate subtitles
    translated_subtitles = translate_subtitles(subtitles, target_language)

    # Reserve the output name by creating the file: tempfile.mktemp() only
    # returns a name that another process could claim first.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        output_path = handle.name

    # Add translated subtitles to video
    try:
        add_subtitles_to_video(video_path, translated_subtitles, output_path)
    except Exception:
        # Do not leave an empty or partial output file behind.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise
    return output_path
def download_youtube_video(youtube_link):
    """Download a YouTube video into the system temp directory.

    Selects the highest-resolution progressive MP4 stream.

    Args:
        youtube_link: URL of the YouTube video.

    Returns:
        Path of the downloaded file.

    Raises:
        ValueError: If the video offers no progressive MP4 stream.
    """
    yt = pytube.YouTube(youtube_link)
    stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # first() yields None when the filter matches nothing; fail with a clear
    # message instead of an AttributeError on None.
    if stream is None:
        raise ValueError("No downloadable MP4 stream was found for this YouTube link.")
    return stream.download(output_path=tempfile.gettempdir())
def gradio_interface(video, yt_link, target_language):
    """Gradio callback: subtitle an uploaded video or a YouTube video.

    Args:
        video: The uploaded video, either as a file path or as a file-like
            object exposing the path via ``.name``; ``None`` when nothing was
            uploaded. An upload takes precedence over ``yt_link``.
        yt_link: YouTube URL, used only when no video was uploaded.
        target_language: Language to translate the subtitles into.

    Returns:
        Path of the processed video.

    Raises:
        ValueError: If neither a video nor a link is given, or if no target
            language is selected.
    """
    if video is None and not yt_link:
        raise ValueError("Please provide either a video file or a YouTube link.")
    # Validate up front so a missing selection fails before the expensive
    # download and transcription rather than at translation time.
    if not target_language:
        raise ValueError("Please select a target language.")

    if video is not None:
        # The upload may arrive as a plain path or as a file wrapper.
        if isinstance(video, (str, os.PathLike)):
            video_path = video
        else:
            video_path = video.name
    else:
        video_path = download_youtube_video(yt_link)

    return process_video(video_path, target_language)
# Web UI: takes an uploaded video or a YouTube link plus a target language
# (three-letter code) and returns the subtitled video.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Textbox(label="YouTube Link"),
        gr.Dropdown(choices=list(LANGUAGES.values()), label="Target Language"),
    ],
    outputs=gr.Video(label="Processed Video"),
    title="Video Subtitle Translator",
    description="Upload a video or provide a YouTube link, and get it back with translated subtitles!",
)
iface.launch()