Spaces:

shukdevdatta123
/

Video-Transcriber

Running

App Files Files Community

shukdevdatta123 committed on 10 days ago

Commit

ee377d8

verified ·

1 Parent(s): d1d87d3

Update app.py

Browse files

Files changed (1) hide show

app.py +463 -151

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import streamlit as st #
 import moviepy.editor as mp
 import speech_recognition as sr
 from pydub import AudioSegment
@@ -7,194 +7,506 @@ import os
 import io
 from transformers import pipeline
 import matplotlib.pyplot as plt
-# Function to convert video to audio
-def video_to_audio(video_file):
-    # Load the video using moviepy
-    video = mp.VideoFileClip(video_file)
-    # Extract audio
-    audio = video.audio
-    temp_audio_path = tempfile.mktemp(suffix=".mp3")
-    # Write the audio to a file
-    audio.write_audiofile(temp_audio_path)
-    return temp_audio_path
 # Function to convert MP3 audio to WAV
 def convert_mp3_to_wav(mp3_file):
-    # Load the MP3 file using pydub
-    audio = AudioSegment.from_mp3(mp3_file)
-    # Create a temporary WAV file
-    temp_wav_path = tempfile.mktemp(suffix=".wav")
-    # Export the audio to the temporary WAV file
-    audio.export(temp_wav_path, format="wav")
-    return temp_wav_path
-# Function to transcribe audio to text
-def transcribe_audio(audio_file):
-    # Initialize recognizer
-    recognizer = sr.Recognizer()
-    # Load the audio file using speech_recognition
-    audio = sr.AudioFile(audio_file)
-    with audio as source:
-        audio_data = recognizer.record(source)
     try:
-        # Transcribe the audio data to text using Google Web Speech API
-        text = recognizer.recognize_google(audio_data)
-        return text
     except sr.UnknownValueError:
         return "Audio could not be understood."
-    except sr.RequestError:
-        return "Could not request results from Google Speech Recognition service."
 # Function to perform emotion detection using Hugging Face transformers
 def detect_emotion(text):
-    # Load emotion detection pipeline
-    emotion_pipeline = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
-    # Get the emotion predictions
-    result = emotion_pipeline(text)
-    # Extract the emotion with the highest score
-    emotions = {emotion['label']: emotion['score'] for emotion in result[0]}
-    return emotions
 # Streamlit app layout
-st.title("Video and Audio to Text Transcription with Emotion Detection and Visualization")
-st.write("Upload a video or audio file to convert it to transcription, detect emotions, and visualize the audio waveform.")
 # Create tabs to separate video and audio uploads
-tab = st.selectbox("Select the type of file to upload", ["Video", "Audio"])
-if tab == "Video":
-    # File uploader for video
-    uploaded_video = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"])
     if uploaded_video is not None:
         # Save the uploaded video file temporarily
-        with tempfile.NamedTemporaryFile(delete=False) as tmp_video:
             tmp_video.write(uploaded_video.read())
             tmp_video_path = tmp_video.name
         # Add an "Analyze Video" button
-        if st.button("Analyze Video"):
-            with st.spinner("Processing video... Please wait."):
-                # Convert video to audio
-                audio_file = video_to_audio(tmp_video_path)
-                # Convert the extracted MP3 audio to WAV
-                wav_audio_file = convert_mp3_to_wav(audio_file)
-                # Transcribe audio to text
-                transcription = transcribe_audio(wav_audio_file)
-                # Show the transcription
-                st.text_area("Transcription", transcription, height=300)
-                # Emotion detection
-                emotions = detect_emotion(transcription)
-                st.write(f"Detected Emotions: {emotions}")
-                # Store transcription and audio file in session state
-                st.session_state.transcription = transcription
-                # Store the audio file as a BytesIO object in memory
-                with open(wav_audio_file, "rb") as f:
-                    audio_data = f.read()
-                    st.session_state.wav_audio_file = io.BytesIO(audio_data)
-                # Cleanup temporary files
-                os.remove(tmp_video_path)
-                os.remove(audio_file)
-    # Check if transcription and audio file are stored in session state
-    if 'transcription' in st.session_state and 'wav_audio_file' in st.session_state:
-        # Provide the audio file to the user for download
-        st.audio(st.session_state.wav_audio_file, format='audio/wav')
-        # Add download buttons for the transcription and audio
-        # Downloadable transcription file
-        st.download_button(
-            label="Download Transcription",
-            data=st.session_state.transcription,
-            file_name="transcription.txt",
-            mime="text/plain"
-        )
-        # Downloadable audio file
-        st.download_button(
-            label="Download Audio",
-            data=st.session_state.wav_audio_file,
-            file_name="converted_audio.wav",
-            mime="audio/wav"
-        )
-elif tab == "Audio":
     # File uploader for audio
-    uploaded_audio = st.file_uploader("Upload Audio", type=["wav", "mp3"])
     if uploaded_audio is not None:
         # Save the uploaded audio file temporarily
         with tempfile.NamedTemporaryFile(delete=False) as tmp_audio:
             tmp_audio.write(uploaded_audio.read())
             tmp_audio_path = tmp_audio.name
         # Add an "Analyze Audio" button
-        if st.button("Analyze Audio"):
-            with st.spinner("Processing audio... Please wait."):
-                # Convert audio to WAV if it's in MP3 format
-                if uploaded_audio.type == "audio/mpeg":
-                    wav_audio_file = convert_mp3_to_wav(tmp_audio_path)
-                else:
-                    wav_audio_file = tmp_audio_path
-                # Transcribe audio to text
-                transcription = transcribe_audio(wav_audio_file)
-                # Show the transcription
-                st.text_area("Transcription", transcription, height=300)
-                # Emotion detection
-                emotions = detect_emotion(transcription)
-                st.write(f"Detected Emotions: {emotions}")
-                # Store transcription in session state
-                st.session_state.transcription_audio = transcription
-                # Store the audio file as a BytesIO object in memory
-                with open(wav_audio_file, "rb") as f:
-                    audio_data = f.read()
-                    st.session_state.wav_audio_file_audio = io.BytesIO(audio_data)
-                # Cleanup temporary audio file
-                os.remove(tmp_audio_path)
-        # Check if transcription and audio file are stored in session state
-        if 'transcription_audio' in st.session_state and 'wav_audio_file_audio' in st.session_state:
-            # Provide the audio file to the user for download
-            st.audio(st.session_state.wav_audio_file_audio, format='audio/wav')
-            # Add download buttons for the transcription and audio
-            # Downloadable transcription file
-            st.download_button(
-                label="Download Transcription",
-                data=st.session_state.transcription_audio,
-                file_name="transcription_audio.txt",
-                mime="text/plain"
-            )
-            # Downloadable audio file
-            st.download_button(
-                label="Download Audio",
-                data=st.session_state.wav_audio_file_audio,
-                file_name="converted_audio_audio.wav",
-                mime="audio/wav"
-            )

+import streamlit as st
 import moviepy.editor as mp
 import speech_recognition as sr
 from pydub import AudioSegment
 import io
 from transformers import pipeline
 import matplotlib.pyplot as plt
+import gc
+import warnings
+warnings.filterwarnings("ignore")
+# Configure the page shell (browser-tab title, icon, wide layout); this call does not itself change any upload limit
+st.set_page_config(
+    page_title="Video/Audio Transcription with Emotion Detection",
+    page_icon="🎬",
+    layout="wide"  # use the full browser width rather than the centered column
+)
+# NOTE(review): get_config() only returns a constant dict and is never called in the visible code, so it does not change the upload limit.
+# The effective limit presumably comes from Streamlit's `server.maxUploadSize` option (in MB) in the config file or environment - TODO confirm for this deployment.
+@st.cache_data
+def get_config():
+    return {"maxUploadSize": 1024}  # 1024 MB = 1 GB; informational value only
+# Function to convert video to audio with progress tracking
+def video_to_audio(video_file, progress_callback=None):
+    """Extract the audio track of a video into a temporary MP3 file.
+
+    Args:
+        video_file: Path of the video file to read.
+        progress_callback: Optional callable that receives a percentage:
+            50 just before encoding starts and 100 once the MP3 is written.
+
+    Returns:
+        Path of the new MP3 file (the caller is responsible for deleting
+        it), or None if the conversion failed. Failures are reported to the
+        user with st.error rather than raised.
+    """
+    video = None
+    audio = None
+    temp_audio_path = None
+    try:
+        video = mp.VideoFileClip(video_file)
+        audio = video.audio
+        if audio is None:
+            # A clip without an audio track exposes audio=None; report that
+            # plainly instead of failing later with an AttributeError.
+            st.error("Error converting video to audio: the video has no audio track")
+            return None
+        # mkstemp creates the file atomically; tempfile.mktemp is deprecated
+        # because another process can claim the name before it is used.
+        fd, temp_audio_path = tempfile.mkstemp(suffix=".mp3")
+        os.close(fd)
+        if progress_callback:
+            progress_callback(50)  # 50% progress
+        audio.write_audiofile(temp_audio_path, verbose=False, logger=None)
+        if progress_callback:
+            progress_callback(100)  # 100% progress
+        return temp_audio_path
+    except Exception as e:
+        # Do not leave a partially written MP3 behind.
+        if temp_audio_path is not None and os.path.exists(temp_audio_path):
+            os.remove(temp_audio_path)
+        st.error(f"Error converting video to audio: {str(e)}")
+        return None
+    finally:
+        # Runs on success and failure alike, so the readers opened by
+        # moviepy are always released.
+        if audio is not None:
+            audio.close()
+        if video is not None:
+            video.close()
+        gc.collect()
 # Function to convert MP3 audio to WAV
 def convert_mp3_to_wav(mp3_file):
+    """Decode an MP3 file and write it to a temporary WAV file.
+
+    Args:
+        mp3_file: Path of the MP3 file to read.
+
+    Returns:
+        Path of the new WAV file (the caller is responsible for deleting
+        it), or None if decoding or export failed. Failures are reported to
+        the user with st.error rather than raised.
+    """
+    temp_wav_path = None
+    try:
+        # Load the MP3 file using pydub
+        audio = AudioSegment.from_mp3(mp3_file)
+        # mkstemp creates the file atomically; tempfile.mktemp is deprecated
+        # because another process can claim the name before it is used.
+        fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
+        os.close(fd)
+        audio.export(temp_wav_path, format="wav")
+        # Drop the decoded samples promptly; they can be large.
+        del audio
+        gc.collect()
+        return temp_wav_path
+    except Exception as e:
+        # Do not leave a partially written WAV behind.
+        if temp_wav_path is not None and os.path.exists(temp_wav_path):
+            os.remove(temp_wav_path)
+        st.error(f"Error converting MP3 to WAV: {str(e)}")
+        return None
+# Function to transcribe audio to text with chunking for large files
+def transcribe_audio(audio_file, chunk_duration=60):
+    """Transcribe a WAV file with the Google Web Speech recognizer.
+
+    Audio longer than `chunk_duration` seconds is split into consecutive
+    chunks that are transcribed one by one; a chunk that cannot be
+    transcribed contributes a "[Chunk N: ...]" placeholder instead of
+    aborting the whole run.
+
+    Args:
+        audio_file: Path of the WAV file to transcribe.
+        chunk_duration: Maximum chunk length in seconds.
+
+    Returns:
+        The transcription as a single string. Errors are not raised; they
+        are returned as a human-readable message string.
+    """
     try:
+        # Initialize recognizer
+        recognizer = sr.Recognizer()
+        audio_segment = AudioSegment.from_wav(audio_file)
+        total_ms = len(audio_segment)  # pydub lengths are in milliseconds
+        chunk_ms = int(chunk_duration * 1000)
+        transcriptions = []
+        if total_ms > chunk_ms:
+            # Ceiling division: an exact multiple of the chunk length must not
+            # produce an extra, empty trailing chunk.
+            num_chunks = -(-total_ms // chunk_ms)
+            # One bar updated in place; calling st.progress inside the loop
+            # would add a new bar to the page for every chunk.
+            progress_bar = st.progress(0.0, text="Transcribing... 0%")
+            for i in range(num_chunks):
+                start_time = i * chunk_ms
+                end_time = min((i + 1) * chunk_ms, total_ms)
+                chunk = audio_segment[start_time:end_time]
+                # mkstemp creates the file atomically, unlike the deprecated
+                # tempfile.mktemp.
+                fd, chunk_path = tempfile.mkstemp(suffix=".wav")
+                os.close(fd)
+                try:
+                    chunk.export(chunk_path, format="wav")
+                    with sr.AudioFile(chunk_path) as source:
+                        audio_data = recognizer.record(source)
+                    transcriptions.append(recognizer.recognize_google(audio_data))
+                except (sr.UnknownValueError, sr.RequestError):
+                    transcriptions.append(f"[Chunk {i+1}: Audio could not be transcribed]")
+                finally:
+                    # Remove the chunk file even when an unexpected error
+                    # escapes to the outer handlers.
+                    os.remove(chunk_path)
+                progress = int(((i + 1) / num_chunks) * 100)
+                progress_bar.progress(progress / 100, text=f"Transcribing... {progress}%")
+        else:
+            # For shorter audio, transcribe directly
+            with sr.AudioFile(audio_file) as source:
+                audio_data = recognizer.record(source)
+            transcriptions.append(recognizer.recognize_google(audio_data))
+        # Join all transcriptions
+        full_transcription = " ".join(transcriptions)
+        # Release the decoded audio before returning
+        del audio_segment
+        gc.collect()
+        return full_transcription
     except sr.UnknownValueError:
         return "Audio could not be understood."
+    except sr.RequestError as e:
+        return f"Could not request results from Google Speech Recognition service: {str(e)}"
+    except Exception as e:
+        return f"Error during transcription: {str(e)}"
 # Function to perform emotion detection using Hugging Face transformers
+@st.cache_resource
+def load_emotion_model():
+    """Build the text-classification pipeline used for emotion scoring.
+
+    Decorated with st.cache_resource, so the pipeline object is reused
+    between calls instead of being rebuilt.
+    """
+    model_name = "j-hartmann/emotion-english-distilroberta-base"
+    classifier = pipeline(
+        "text-classification",
+        model=model_name,
+        return_all_scores=True,  # ask for a score per label, not only the best one
+    )
+    return classifier
 def detect_emotion(text):
+    """Score the emotions expressed in `text`.
+
+    The text is classified in slices of at most 500 characters (the model
+    has a token limit) and each label's score is the arithmetic mean over
+    all slices, so every slice carries the same weight.
+
+    Args:
+        text: The text to analyze.
+
+    Returns:
+        A dict mapping emotion label to mean score, or
+        {"error": "Could not analyze emotions"} when classification fails
+        (the failure is also reported with st.error).
+    """
+    try:
+        emotion_pipeline = load_emotion_model()
+        max_length = 500
+        # `or [text]` keeps a single classification call for empty input,
+        # where the range below yields no slices.
+        chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)] or [text]
+        totals = {}
+        for chunk in chunks:
+            result = emotion_pipeline(chunk)
+            for emotion in result[0]:
+                label = emotion['label']
+                totals[label] = totals.get(label, 0.0) + emotion['score']
+        # True mean over slices; repeatedly averaging pairs would weight the
+        # last slices exponentially more than the first ones.
+        return {label: total / len(chunks) for label, total in totals.items()}
+    except Exception as e:
+        st.error(f"Error in emotion detection: {str(e)}")
+        return {"error": "Could not analyze emotions"}
+# Function to visualize emotions
+def plot_emotions(emotions):
+    """Draw the emotion scores as a bar chart, highest score first.
+
+    Returns the matplotlib figure, or None when `emotions` carries the
+    "error" key instead of scores.
+    """
+    if "error" in emotions:
+        return None
+    fig, ax = plt.subplots(figsize=(10, 6))
+    # Rank the labels by descending score
+    ranked = sorted(emotions.items(), key=lambda item: item[1], reverse=True)
+    labels = [label for label, _ in ranked]
+    scores = [score for _, score in ranked]
+    palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8']
+    bars = ax.bar(labels, scores, color=palette[:len(ranked)])
+    ax.set_xlabel('Emotions')
+    ax.set_ylabel('Confidence Score')
+    ax.set_title('Emotion Detection Results')
+    ax.set_ylim(0, 1)
+    # Print each score just above its bar
+    for rect in bars:
+        top = rect.get_height()
+        center = rect.get_x() + rect.get_width()/2.
+        ax.text(center, top + 0.01, f'{top:.3f}', ha='center', va='bottom')
+    plt.xticks(rotation=45)
+    plt.tight_layout()
+    return fig
 # Streamlit app layout
+st.title("🎬 Video and Audio Transcription with Emotion Detection")
+st.write("Upload video files up to 1GB or audio files for transcription and emotion analysis.")
+# Display file size information (informational only: nothing in the visible code enforces these limits)
+st.info("📁 **File Size Limits**: Video files up to 1GB, Audio files up to 500MB")
+# Collapsible usage notes and the list of accepted formats (matches the `type=` lists of the two uploaders)
+with st.expander("📋 Instructions for Large Files"):
+    st.write("""
+    **For optimal performance with large files:**
+    1. Ensure stable internet connection
+    2. Be patient - large files take time to process
+    3. Don't close the browser tab during processing
+    4. For very large files, consider splitting them beforehand
+    **Supported formats:**
+    - **Video**: MP4, MOV, AVI
+    - **Audio**: WAV, MP3
+    """)
 # Create tabs to separate video and audio uploads
+tab1, tab2 = st.tabs(["📹 Video Upload", "🎵 Audio Upload"])
+with tab1:
+    st.header("Video File Processing")
+    # File uploader for video; the help text is advisory and is not enforced here
+    uploaded_video = st.file_uploader(
+        "Upload Video File",
+        type=["mp4", "mov", "avi"],
+        help="Maximum file size: 1GB"
+    )
     if uploaded_video is not None:
+        # Display file information
+        file_size_mb = uploaded_video.size / (1024 * 1024)
+        st.info(f"📊 **File Info**: {uploaded_video.name} ({file_size_mb:.1f} MB)")
+        # Show video preview for smaller files
+        if file_size_mb < 100:  # Only show preview for files under 100MB
+            st.video(uploaded_video)
         # Add an "Analyze Video" button
+        if st.button("🔄 Analyze Video", type="primary"):
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            # Every temporary file created below is recorded here so that the
+            # finally clause can delete it however the run ends (success,
+            # error, or st.stop()).
+            temp_paths = []
+            try:
+                with st.spinner("Processing video... This may take several minutes for large files."):
+                    # Write the upload to disk only once analysis is requested;
+                    # doing it on every script rerun left one copy behind per
+                    # interaction. Keep the real extension instead of assuming .mp4.
+                    suffix = os.path.splitext(uploaded_video.name)[1] or ".mp4"
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_video:
+                        tmp_video_path = tmp_video.name
+                        temp_paths.append(tmp_video_path)
+                        # getvalue() does not depend on the current read position
+                        tmp_video.write(uploaded_video.getvalue())
+                    status_text.text("Step 1/4: Converting video to audio...")
+                    progress_bar.progress(10)
+                    # st.progress accepts an int in 0-100 or a float in 0.0-1.0;
+                    # the scaled value is a float such as 25.0, so convert it
+                    # back to int before passing it on.
+                    audio_file = video_to_audio(
+                        tmp_video_path,
+                        lambda p: progress_bar.progress(int(10 + p * 0.3)))
+                    if audio_file is None:
+                        st.error("Failed to extract audio from video.")
+                        st.stop()
+                    temp_paths.append(audio_file)
+                    status_text.text("Step 2/4: Converting audio format...")
+                    progress_bar.progress(50)
+                    # Convert the extracted MP3 audio to WAV
+                    wav_audio_file = convert_mp3_to_wav(audio_file)
+                    if wav_audio_file is None:
+                        st.error("Failed to convert audio format.")
+                        st.stop()
+                    temp_paths.append(wav_audio_file)
+                    status_text.text("Step 3/4: Transcribing audio to text...")
+                    progress_bar.progress(60)
+                    # Transcribe audio to text
+                    transcription = transcribe_audio(wav_audio_file)
+                    status_text.text("Step 4/4: Analyzing emotions...")
+                    progress_bar.progress(90)
+                    # Emotion detection
+                    emotions = detect_emotion(transcription)
+                    progress_bar.progress(100)
+                    status_text.text("✅ Processing complete!")
+                    # Display results
+                    st.success("Analysis completed successfully!")
+                    # Show the transcription. The widget key must differ from the
+                    # session-state name assigned below: Streamlit rejects writes
+                    # to a key owned by an already-instantiated widget.
+                    st.subheader("📝 Transcription")
+                    st.text_area("Transcription", transcription, height=300,
+                                 key="video_transcription_display",
+                                 label_visibility="collapsed")
+                    # Show emotions
+                    st.subheader("😊 Emotion Analysis")
+                    col1, col2 = st.columns([1, 1])
+                    with col1:
+                        st.write("**Detected Emotions:**")
+                        if "error" in emotions:
+                            # The error dict holds a message string, which cannot
+                            # be formatted as a float.
+                            st.warning(emotions["error"])
+                        else:
+                            for emotion, score in emotions.items():
+                                st.write(f"- **{emotion.title()}**: {score:.3f}")
+                    with col2:
+                        fig = plot_emotions(emotions)
+                        if fig:
+                            st.pyplot(fig)
+                    # Store results in session state
+                    st.session_state.video_transcription = transcription
+                    st.session_state.video_emotions = emotions
+                    # Store the audio file as a BytesIO object in memory
+                    with open(wav_audio_file, "rb") as f:
+                        st.session_state.video_wav_audio_file = io.BytesIO(f.read())
+            except Exception as e:
+                st.error(f"An error occurred during processing: {str(e)}")
+            finally:
+                # Best-effort cleanup of every temporary file created above
+                for path in temp_paths:
+                    try:
+                        os.remove(path)
+                    except OSError:
+                        pass  # already gone or not removable; nothing more to do
+    # Check if results are stored in session state
+    if 'video_transcription' in st.session_state and 'video_wav_audio_file' in st.session_state:
+        st.subheader("📥 Download Results")
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            # Provide the audio file to the user for playback
+            st.audio(st.session_state.video_wav_audio_file, format='audio/wav')
+        with col2:
+            # Downloadable transcription file
+            st.download_button(
+                label="📄 Download Transcription",
+                data=st.session_state.video_transcription,
+                file_name="video_transcription.txt",
+                mime="text/plain"
+            )
+        with col3:
+            # Downloadable audio file
+            st.download_button(
+                label="🎵 Download Audio",
+                data=st.session_state.video_wav_audio_file,
+                file_name="extracted_audio.wav",
+                mime="audio/wav"
+            )
+with tab2:
+    st.header("Audio File Processing")
     # File uploader for audio
+    uploaded_audio = st.file_uploader(
+        "Upload Audio File",
+        type=["wav", "mp3"],
+        help="Maximum file size: 500MB"
+    )
     if uploaded_audio is not None:
+        # Display file information
+        file_size_mb = uploaded_audio.size / (1024 * 1024)
+        st.info(f"📊 **File Info**: {uploaded_audio.name} ({file_size_mb:.1f} MB)")
+        # Show audio player
+        st.audio(uploaded_audio)
         # Add an "Analyze Audio" button
+        if st.button("🔄 Analyze Audio", type="primary"):
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            # Every temporary file created below is recorded here so that the
+            # finally clause can delete it however the run ends (success,
+            # error, or st.stop()).
+            temp_paths = []
+            try:
+                with st.spinner("Processing audio... Please wait."):
+                    # Write the upload to disk only once analysis is requested;
+                    # doing it on every script rerun left one copy behind per
+                    # interaction.
+                    with tempfile.NamedTemporaryFile(delete=False) as tmp_audio:
+                        tmp_audio_path = tmp_audio.name
+                        temp_paths.append(tmp_audio_path)
+                        # getvalue() does not depend on the current read position
+                        tmp_audio.write(uploaded_audio.getvalue())
+                    status_text.text("Step 1/3: Converting audio format...")
+                    progress_bar.progress(20)
+                    # Convert audio to WAV if it's in MP3 format. The file name is
+                    # checked as well because the reported MIME type for MP3 is
+                    # not guaranteed to be "audio/mpeg".
+                    is_mp3 = (uploaded_audio.type == "audio/mpeg"
+                              or uploaded_audio.name.lower().endswith(".mp3"))
+                    if is_mp3:
+                        wav_audio_file = convert_mp3_to_wav(tmp_audio_path)
+                    else:
+                        wav_audio_file = tmp_audio_path
+                    if wav_audio_file is None:
+                        st.error("Failed to process audio file.")
+                        st.stop()
+                    if wav_audio_file != tmp_audio_path:
+                        temp_paths.append(wav_audio_file)
+                    status_text.text("Step 2/3: Transcribing audio to text...")
+                    progress_bar.progress(40)
+                    # Transcribe audio to text
+                    transcription = transcribe_audio(wav_audio_file)
+                    status_text.text("Step 3/3: Analyzing emotions...")
+                    progress_bar.progress(80)
+                    # Emotion detection
+                    emotions = detect_emotion(transcription)
+                    progress_bar.progress(100)
+                    status_text.text("✅ Processing complete!")
+                    # Display results
+                    st.success("Analysis completed successfully!")
+                    # Show the transcription. The widget key must differ from the
+                    # session-state name assigned below: Streamlit rejects writes
+                    # to a key owned by an already-instantiated widget.
+                    st.subheader("📝 Transcription")
+                    st.text_area("Transcription", transcription, height=300,
+                                 key="audio_transcription_display",
+                                 label_visibility="collapsed")
+                    # Show emotions
+                    st.subheader("😊 Emotion Analysis")
+                    col1, col2 = st.columns([1, 1])
+                    with col1:
+                        st.write("**Detected Emotions:**")
+                        if "error" in emotions:
+                            # The error dict holds a message string, which cannot
+                            # be formatted as a float.
+                            st.warning(emotions["error"])
+                        else:
+                            for emotion, score in emotions.items():
+                                st.write(f"- **{emotion.title()}**: {score:.3f}")
+                    with col2:
+                        fig = plot_emotions(emotions)
+                        if fig:
+                            st.pyplot(fig)
+                    # Store results in session state
+                    st.session_state.audio_transcription = transcription
+                    st.session_state.audio_emotions = emotions
+                    # Store the audio file as a BytesIO object in memory
+                    with open(wav_audio_file, "rb") as f:
+                        st.session_state.audio_wav_audio_file = io.BytesIO(f.read())
+            except Exception as e:
+                st.error(f"An error occurred during processing: {str(e)}")
+            finally:
+                # Best-effort cleanup of every temporary file created above
+                for path in temp_paths:
+                    try:
+                        os.remove(path)
+                    except OSError:
+                        pass  # already gone or not removable; nothing more to do
+        # Check if results are stored in session state
+        if 'audio_transcription' in st.session_state and 'audio_wav_audio_file' in st.session_state:
+            st.subheader("📥 Download Results")
+            col1, col2 = st.columns(2)
+            with col1:
+                # Downloadable transcription file
+                st.download_button(
+                    label="📄 Download Transcription",
+                    data=st.session_state.audio_transcription,
+                    file_name="audio_transcription.txt",
+                    mime="text/plain"
+                )
+            with col2:
+                # Downloadable audio file
+                st.download_button(
+                    label="🎵 Download Processed Audio",
+                    data=st.session_state.audio_wav_audio_file,
+                    file_name="processed_audio.wav",
+                    mime="audio/wav"
+                )
+# Footer: a Markdown horizontal rule followed by a one-line credits note
+st.markdown("---")
+st.markdown("Built with ❤️ using Streamlit, MoviePy, and HuggingFace Transformers")