import gradio as gr
import moviepy.editor as mp
from deep_translator import GoogleTranslator
import os
import tempfile
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import spaces
import pytube
import librosa

# Map Whisper's 2-letter language codes to ISO 639-3 codes (shown in the UI dropdown)
LANGUAGES = {
    "en": "eng",
    "zh": "zho",
    "de": "deu",
    "es": "spa",
    "ru": "rus",
    "ko": "kor",
    "fr": "fra",
    "ja": "jpn",
    "pt": "por",
    "tr": "tur",
    "pl": "pol",
    "ca": "cat",
    "nl": "nld",
    "ar": "ara",
    "sv": "swe",
    "it": "ita",
    "id": "ind",
    "hi": "hin",
    "fi": "fin",
    "vi": "vie",
    "iw": "heb",
    "uk": "ukr",
    "el": "ell",
    "ms": "msa",
    "cs": "ces",
    "ro": "ron",
    "da": "dan",
    "hu": "hun",
    "ta": "tam",
    "no": "nor",
    "th": "tha",
    "ur": "urd",
    "hr": "hrv",
    "bg": "bul",
    "lt": "lit",
    "la": "lat",
    "mi": "mri",
    "ml": "mal",
    "cy": "cym",
    "sk": "slk",
    "te": "tel",
    "fa": "fas",
    "lv": "lav",
    "bn": "ben",
    "sr": "srp",
    "az": "aze",
    "sl": "slv",
    "kn": "kan",
    "et": "est",
    "mk": "mkd",
    "br": "bre",
    "eu": "eus",
    "is": "isl",
    "hy": "hye",
    "ne": "nep",
    "mn": "mon",
    "bs": "bos",
    "kk": "kaz",
    "sq": "sqi",
    "sw": "swa",
    "gl": "glg",
    "mr": "mar",
    "pa": "pan",
    "si": "sin",
    "km": "khm",
    "sn": "sna",
    "yo": "yor",
    "so": "som",
    "af": "afr",
    "oc": "oci",
    "ka": "kat",
    "be": "bel",
    "tg": "tgk",
    "sd": "snd",
    "gu": "guj",
    "am": "amh",
    "yi": "yid",
    "lo": "lao",
    "uz": "uzb",
    "fo": "fao",
    "ht": "hat",
    "ps": "pus",
    "tk": "tuk",
    "nn": "nno",
    "mt": "mlt",
    "sa": "san",
    "lb": "ltz",
    "my": "mya",
    "bo": "bod",
    "tl": "tgl",
    "mg": "mlg",
    "as": "asm",
    "tt": "tat",
    "haw": "haw",
    "ln": "lin",
    "ha": "hau",
    "ba": "bak",
    "jw": "jav",
    "su": "sun",
}

def extract_audio(video_path):
    video = mp.VideoFileClip(video_path)
    # tempfile.mktemp is deprecated and race-prone; mkstemp creates the file safely
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    video.audio.write_audiofile(audio_path)
    video.close()
    return audio_path

@spaces.GPU
def generate_subtitles(audio_path):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").to(device)
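    # Loading the model inside the function keeps CUDA initialization within the
    # GPU context that @spaces.GPU provides (at the cost of reloading on each call)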

    # Load and preprocess the audio
    audio_input, _ = librosa.load(audio_path, sr=16000)
    input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features.to(device)
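    # Note: the Whisper processor pads/truncates audio to a 30-second window,
    # so only roughly the first 30 seconds of the clip are transcribed here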

    # Generate token ids (Whisper auto-detects the spoken language when no
    # forced decoder prompt is supplied)
    predicted_ids = model.generate(input_features)

    # Decode token ids to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    # For simplicity, we're returning a single segment with the full transcription
    # In a more advanced implementation, you might want to split this into multiple segments
    return [{"start": 0, "end": len(audio_input) / 16000, "text": transcription[0]}]
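
# A possible segment-level alternative (a commented sketch, not wired into the
# app): the transformers ASR pipeline can return chunk timestamps directly.
#
#   from transformers import pipeline
#   asr = pipeline("automatic-speech-recognition",
#                  model="openai/whisper-large-v3",
#                  return_timestamps=True,
#                  device=0 if torch.cuda.is_available() else -1)
#   chunks = asr(audio_path)["chunks"]
#   segments = [{"start": c["timestamp"][0], "end": c["timestamp"][1],
#                "text": c["text"]} for c in chunks]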

def translate_subtitles(subtitles, target_language):
    # The dropdown supplies an ISO 639-3 code; map it back to the 2-letter code
    # that GoogleTranslator expects
    target_lang_code = next((k for k, v in LANGUAGES.items() if v == target_language), target_language)
    translator = GoogleTranslator(source='auto', target=target_lang_code)
    translated_subtitles = []
    for segment in subtitles:
        translated_text = translator.translate(segment["text"])
        translated_subtitles.append({
            "start": segment["start"],
            "end": segment["end"],
            "text": translated_text
        })
    return translated_subtitles

def add_subtitles_to_video(video_path, subtitles, output_path):
    video = mp.VideoFileClip(video_path)
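    # Note: moviepy's TextClip requires ImageMagick to be installed on the host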
    subtitles_clips = [
        mp.TextClip(txt=subtitle["text"], fontsize=24, color='white', bg_color='black', font='Arial')
        .set_position(('center', 'bottom'))
        .set_duration(subtitle["end"] - subtitle["start"])
        .set_start(subtitle["start"])
        for subtitle in subtitles
    ]
    final_video = mp.CompositeVideoClip([video] + subtitles_clips)
    final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")

def process_video(video_path, target_language):
    # Extract audio from video
    audio_path = extract_audio(video_path)

    # Generate subtitles using Whisper
    subtitles = generate_subtitles(audio_path)

    # Translate subtitles
    translated_subtitles = translate_subtitles(subtitles, target_language)

    # Add translated subtitles to video
    fd, output_path = tempfile.mkstemp(suffix=".mp4")  # avoid deprecated tempfile.mktemp
    os.close(fd)
    add_subtitles_to_video(video_path, translated_subtitles, output_path)

    return output_path

def download_youtube_video(youtube_link):
    yt = pytube.YouTube(youtube_link)
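    # Progressive streams bundle audio and video in a single file
    # (YouTube typically caps these at 720p)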
    video = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    video_path = video.download(output_path=tempfile.gettempdir())
    return video_path

def gradio_interface(video, yt_link, target_language):
    if video is not None:
        # Depending on the Gradio version, the Video input arrives as a filepath
        # string or as a file-like object with a .name attribute
        video_path = video if isinstance(video, str) else video.name
    elif yt_link:
        video_path = download_youtube_video(yt_link)
    else:
        raise ValueError("Please provide either a video file or a YouTube link.")
    
    output_video = process_video(video_path, target_language)
    return output_video

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Textbox(label="YouTube Link"),
        gr.Dropdown(choices=list(LANGUAGES.values()), label="Target Language (ISO 639-3 code)")
    ],
    outputs=gr.Video(label="Processed Video"),
    title="Video Subtitle Translator",
    description="Upload a video or provide a YouTube link, and get it back with translated subtitles!"
)

if __name__ == "__main__":
    iface.launch()