"""Streamlit app: extract audio from an uploaded video, transcribe it with
Whisper, translate the transcript, and synthesize translated speech via gTTS."""

import os
import tempfile

import numpy as np
import streamlit as st
import whisper
from gtts import gTTS
from moviepy.editor import VideoFileClip
from translate import Translator

# Load the Whisper speech-recognition model once at startup.
try:
    whisper_model = whisper.load_model("base")
except Exception as e:
    # Bind the name so later code can test it instead of hitting a NameError.
    whisper_model = None
    st.error(f"Error loading Whisper model: {e}")

# Supported target languages: display name -> ISO 639-1 code.
LANGUAGES = {
    'English': 'en',
    'Tamil': 'ta',
    'Sinhala': 'si',
    'French': 'fr',
    # Add more languages as needed
}

st.title("AI Video Translator with Whisper and GTTS")


def transcribe_audio_in_chunks(audio_path, model, chunk_length=30):
    """Transcribe an audio file with Whisper in fixed-length chunks.

    Args:
        audio_path: Path to a WAV file readable by ``whisper.load_audio``.
        model: A loaded Whisper model instance.
        chunk_length: Chunk size in seconds (default 30, Whisper's window).

    Returns:
        The concatenated transcription of all chunks as a single string.
    """
    audio_clip = whisper.load_audio(audio_path)
    sample_rate = whisper.audio.SAMPLE_RATE
    audio_duration = len(audio_clip) / sample_rate  # duration in seconds
    segments = []
    for start in np.arange(0, audio_duration, chunk_length):
        end = min(start + chunk_length, audio_duration)
        segment = audio_clip[int(start * sample_rate):int(end * sample_rate)]
        result = model.transcribe(segment)
        segments.append(result['text'])
    return ' '.join(segments)


def translate_in_chunks(text, translator, max_length=500):
    """Translate text in word-aligned pieces of at most *max_length* chars.

    The ``translate`` backend rejects long inputs, so the text is split on
    word boundaries, each piece translated separately, and the results joined.

    Args:
        text: Source text to translate.
        translator: A ``translate.Translator`` configured with a target language.
        max_length: Maximum characters per translated chunk.

    Returns:
        The translated text as a single string.
    """
    words = text.split()
    chunks = []
    current_chunk = ""
    for word in words:
        # +1 accounts for the joining space.
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += " " + word if current_chunk else word
        else:
            chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    translated_chunks = [translator.translate(chunk) for chunk in chunks]
    return ' '.join(translated_chunks)


# Step 1: Upload video file
video_file = st.file_uploader("Upload a video file", type=["mp4", "mov", "avi", "mkv"])

if video_file:
    # Step 2: Select translation language
    target_language = st.selectbox("Select the target language for translation", list(LANGUAGES.keys()))

    # Process when user clicks translate
    if st.button("Translate Video"):
        if whisper_model is None:
            st.error("Whisper model is not available; cannot transcribe.")
            st.stop()

        # Persist the upload to disk so moviepy can open it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_video:
            temp_video.write(video_file.read())
            temp_video_path = temp_video.name

        # Extract the audio track to a temporary WAV file.
        try:
            video = VideoFileClip(temp_video_path)
            # NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                audio_path = temp_audio.name
            video.audio.write_audiofile(audio_path)
            video.close()  # release ffmpeg readers / file handles held by moviepy
        except Exception as e:
            st.error(f"Error extracting audio from video: {e}")
            os.remove(temp_video_path)
            st.stop()

        audio_output_path = None  # defined up-front so cleanup below is always safe
        try:
            # Transcribe audio using Whisper
            original_text = transcribe_audio_in_chunks(audio_path, whisper_model)
            st.write("Original Transcription:", original_text)

            # Translate text to the target language
            translator = Translator(to_lang=LANGUAGES[target_language])
            translated_text = translate_in_chunks(original_text, translator)
            st.write(f"Translated Text ({target_language}):", translated_text)

            # Convert translated text to speech
            tts = gTTS(text=translated_text, lang=LANGUAGES[target_language])
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_mp3:
                audio_output_path = temp_mp3.name
            tts.save(audio_output_path)

            # Display translated text and audio
            st.success("Translation successful!")
            st.audio(audio_output_path, format="audio/mp3")
        except Exception as e:
            st.error(f"Error during transcription/translation: {e}")
        finally:
            # Clean up temporary files even if a step above raised.
            os.remove(temp_video_path)
            os.remove(audio_path)
            if audio_output_path:  # Only remove if it was created
                os.remove(audio_output_path)