import streamlit as st
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
import tempfile
import os
import io
import math
from transformers import pipeline
import matplotlib.pyplot as plt
import librosa
import numpy as np
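# Assumed environment (not pinned here): streamlit, moviepy, SpeechRecognition,
# pydub, transformers (with a torch backend), matplotlib, librosa, and numpy,
# plus an ffmpeg binary on PATH -- both moviepy and pydub shell out to ffmpeg.
# The emotion model is fetched from the Hugging Face Hub on first run.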
# Function to convert video to audio
def video_to_audio(video_file):
    video = mp.VideoFileClip(video_file)
    # mkstemp instead of the insecure, deprecated mktemp
    fd, temp_audio_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    video.audio.write_audiofile(temp_audio_path)
    video.close()  # release the underlying ffmpeg reader
    return temp_audio_path
# Function to convert MP3 to WAV
def convert_mp3_to_wav(mp3_file):
    audio = AudioSegment.from_mp3(mp3_file)
    fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    audio.export(temp_wav_path, format="wav")
    return temp_wav_path
# Function to transcribe audio, chunking long files so each request stays small
def transcribe_audio(audio_file):
    audio = AudioSegment.from_wav(audio_file)
    duration = len(audio) / 1000  # duration in seconds
    chunk_length = 60  # 60-second chunks
    recognizer = sr.Recognizer()
    if duration <= chunk_length:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)
        try:
            return recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return "Audio could not be understood."
        except sr.RequestError:
            return "Could not request results from Google Speech Recognition service."
    else:
        # speech_recognition's AudioData assumes mono PCM, so downmix first
        audio = audio.set_channels(1)
        num_chunks = math.ceil(duration / chunk_length)  # avoids a trailing empty chunk
        transcriptions = []
        for i in range(num_chunks):
            start_time = i * chunk_length * 1000  # in milliseconds
            end_time = min((i + 1) * chunk_length * 1000, len(audio))
            chunk = audio[start_time:end_time]
            audio_data = sr.AudioData(chunk.raw_data, audio.frame_rate, audio.sample_width)
            try:
                transcriptions.append(recognizer.recognize_google(audio_data))
            except sr.UnknownValueError:
                transcriptions.append("[Audio could not be understood.]")
            except sr.RequestError:
                transcriptions.append("[Could not request results.]")
        return " ".join(transcriptions)
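# Note: recognize_google calls Google's free Web Speech endpoint with a shared
# default API key; it is rate-limited and meant for testing, not production use.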
# Cache the classifier so the model loads once per process, not on every click
@st.cache_resource
def load_emotion_pipeline():
    return pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
# Function to detect emotions in the transcription
def detect_emotion(text):
    emotion_pipeline = load_emotion_pipeline()
    # truncation=True keeps long transcripts within the model's 512-token limit
    result = emotion_pipeline(text, truncation=True)
    emotions = {emotion['label']: emotion['score'] for emotion in result[0]}
    return emotions
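# Illustrative output (scores are hypothetical; the model's seven labels are fixed):
# {'anger': 0.01, 'disgust': 0.01, 'fear': 0.02, 'joy': 0.85, 'neutral': 0.05,
#  'sadness': 0.03, 'surprise': 0.03}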
# Function to plot audio waveform
def plot_waveform(audio_data, duration=10):
    audio_data.seek(0)
    # avoid shadowing the speech_recognition alias `sr` with the sample rate
    y, sample_rate = librosa.load(audio_data, sr=None, duration=duration)
    fig, ax = plt.subplots(figsize=(10, 4))
    time = np.linspace(0, len(y) / sample_rate, len(y))
    ax.plot(time, y)
    ax.set_title(f"Audio Waveform (first {duration} seconds)")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    st.pyplot(fig)
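# Only the first `duration` seconds are decoded and plotted, which keeps the
# figure responsive even for uploads approaching the 1GB limit.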
# Streamlit app layout
st.title("Video and Audio to Text Transcription with Emotion Detection and Visualization")
st.write("Upload a video or audio file to transcribe it, detect emotions, and visualize the audio waveform.")
st.write("**Note:** To upload files up to 1GB, run the app with: `streamlit run app.py --server.maxUploadSize=1024`")
tab = st.selectbox("Select file type", ["Video", "Audio"])
if tab == "Video":
    uploaded_video = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"])
    if uploaded_video:
        # keep the original extension so ffmpeg can detect the container
        suffix = os.path.splitext(uploaded_video.name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_video:
            tmp_video.write(uploaded_video.read())
            tmp_video_path = tmp_video.name
        if st.button("Analyze Video"):
            with st.spinner("Processing video..."):
                audio_file = video_to_audio(tmp_video_path)
                wav_audio_file = convert_mp3_to_wav(audio_file)
                transcription = transcribe_audio(wav_audio_file)
                st.text_area("Transcription", transcription, height=300)
                emotions = detect_emotion(transcription)
                st.write(f"Detected Emotions: {emotions}")
                with open(wav_audio_file, "rb") as f:
                    audio_data = io.BytesIO(f.read())
                # persist results so the widgets below survive Streamlit reruns
                st.session_state.wav_audio_file = audio_data
                st.session_state.transcription = transcription
                plot_waveform(st.session_state.wav_audio_file)
                # clean up the temporary files once everything is in memory
                os.remove(tmp_video_path)
                os.remove(audio_file)
                os.remove(wav_audio_file)
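    # Rendered outside the button handler: Streamlit reruns the whole script on
    # each interaction, so these widgets replay the results kept in session state.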
    if 'wav_audio_file' in st.session_state:
        st.audio(st.session_state.wav_audio_file, format='audio/wav')
        st.download_button("Download Transcription", st.session_state.transcription, "transcription.txt", "text/plain")
        st.download_button("Download Audio", st.session_state.wav_audio_file, "converted_audio.wav", "audio/wav")
elif tab == "Audio":
    uploaded_audio = st.file_uploader("Upload Audio", type=["wav", "mp3"])
    if uploaded_audio:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_audio:
            tmp_audio.write(uploaded_audio.read())
            tmp_audio_path = tmp_audio.name
        if st.button("Analyze Audio"):
            with st.spinner("Processing audio..."):
                # MP3 uploads are converted first; WAV uploads are used as-is
                wav_audio_file = convert_mp3_to_wav(tmp_audio_path) if uploaded_audio.type == "audio/mpeg" else tmp_audio_path
                transcription = transcribe_audio(wav_audio_file)
                st.text_area("Transcription", transcription, height=300)
                emotions = detect_emotion(transcription)
                st.write(f"Detected Emotions: {emotions}")
                with open(wav_audio_file, "rb") as f:
                    audio_data = io.BytesIO(f.read())
                st.session_state.wav_audio_file_audio = audio_data
                st.session_state.transcription_audio = transcription
                plot_waveform(st.session_state.wav_audio_file_audio)
                if uploaded_audio.type == "audio/mpeg":
                    os.remove(wav_audio_file)
                os.remove(tmp_audio_path)
    if 'wav_audio_file_audio' in st.session_state:
        st.audio(st.session_state.wav_audio_file_audio, format='audio/wav')
        st.download_button("Download Transcription", st.session_state.transcription_audio, "transcription_audio.txt", "text/plain")
        st.download_button("Download Audio", st.session_state.wav_audio_file_audio, "converted_audio_audio.wav", "audio/wav")