Commit 1be32a3 · Parent: 909f75a
made some changes to app.py(oauth2)
app.py CHANGED
@@ -11,10 +11,15 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2
 
+def authenticate_youtube():
+    os.system('yt-dlp --username oauth2 --password ""')
+
 def download_youtube_video(video_url, output_path):
     ydl_opts = {
         'format': 'bestvideo+bestaudio',
         'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
+        'username': 'oauth2',
+        'password': ''
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([video_url])
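Note on the hunk above: the 'oauth2' username with an empty password appears to follow the convention of the community yt-dlp YouTube OAuth2 plugin, so the new options only take effect if that plugin is installed in the Space, and the first authenticated run prints a device-code prompt that has to be completed once. Below is a minimal sketch, not the committed code, of what the download helper could look like with an explicit return value; the committed version returns None even though analyze_video assigns its result to video_path and passes it on to convert_to_mp4.

import os
import yt_dlp

def download_youtube_video(video_url, output_path):
    # 'oauth2' + empty password defer authentication to the YouTube OAuth2
    # plugin (installed separately); drop them for unauthenticated downloads.
    ydl_opts = {
        'format': 'bestvideo+bestaudio',
        'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
        'username': 'oauth2',
        'password': '',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
        # Assumed addition: return the downloaded file's path so callers can use it.
        # (With merged bestvideo+bestaudio formats the final extension may differ.)
        return ydl.prepare_filename(info)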
@@ -92,21 +97,32 @@ emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model
 def analyze_video(video_url):
     global output_path
     output_path = './'
+
+    authenticate_youtube()
+
     video_path = download_youtube_video(video_url, output_path)
+
     mp4_path = convert_to_mp4(video_path, output_path)
+
     audio_path = extract_audio_from_video(mp4_path)
+
     audio_wav_path = convert_mp3_to_wav(audio_path)
+
     model_whisper = whisper.load_model("base")
 
     result_whisper = model_whisper.transcribe(audio_wav_path)
 
     transcript = result_whisper['text']
+
     emotion_dict_text, predicted_emotion_text = process_text(transcript)
 
     n_frame_interval = 60
     emotion_vectors_video = []
+
     video_capture = cv2.VideoCapture(mp4_path)
+
     total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
+
     frame_count_video = 0
 
     while video_capture.isOpened():
@@ -118,7 +134,7 @@ def analyze_video(video_url):
         if frame_count_video % n_frame_interval == 0:
             pixel_values_video = preprocess_frame(frame_video)
             caption_video = generate_caption(pixel_values_video)
-            predicted_emotions_video
+            predicted_emotions_video = predict_emotions(caption_video)
             emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
 
         frame_count_video += 1
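The corrected line completes the per-frame path: every n_frame_interval-th frame is captioned by BLIP and the caption is scored by the text emotion classifier, with the score vector collected for later averaging. The frame-reading and loop-exit handling fall between the hunks shown here, so the sketch below fills them in with the usual cv2 read/break pattern as an assumption; preprocess_frame, generate_caption and predict_emotions are the helpers defined elsewhere in app.py.

import cv2
import numpy as np

emotion_vectors_video = []
frame_count_video = 0
video_capture = cv2.VideoCapture(mp4_path)  # mp4_path produced by convert_to_mp4

while video_capture.isOpened():
    ret, frame_video = video_capture.read()
    if not ret:                 # assumed exit condition: stop at end of stream
        break
    if frame_count_video % n_frame_interval == 0:
        pixel_values_video = preprocess_frame(frame_video)           # BLIP image preprocessing
        caption_video = generate_caption(pixel_values_video)         # BLIP caption for the frame
        predicted_emotions_video = predict_emotions(caption_video)   # emotion scores for the caption
        emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
    frame_count_video += 1

video_capture.release()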
@@ -126,8 +142,11 @@ def analyze_video(video_url):
     video_capture.release()
 
     average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
+
     combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
+
     final_most_predicted_index = np.argmax(combined_emotion_vector_final)
+
     final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
 
     return transcript, predicted_emotion_text, final_most_predicted_emotion
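One caveat on the fusion step above: the transcript scores and the averaged frame scores are concatenated, a single argmax is taken over the combined vector, and the winning index is looked up in list(emotion_dict_text.keys()). Assuming both halves carry the same labels in the same order, any argmax that lands in the video half points past the end of that key list. A hedged sketch of an in-range lookup under that same-label assumption:

import numpy as np

# Both score vectors are assumed to share the same label order (K labels).
text_scores = np.array(list(emotion_dict_text.values()))
video_scores = average_emotion_vector_video

combined = np.concatenate((text_scores, video_scores))
labels = list(emotion_dict_text.keys())

# Modulo maps indices from either half back onto the shared label list,
# so an argmax in the video half no longer overruns the text keys.
final_index = int(np.argmax(combined)) % len(labels)
final_emotion = labels[final_index]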
@@ -139,4 +158,4 @@ iface = gr.Interface(fn=analyze_video,
                      description="Enter a YouTube Video URL to analyze emotions from both audio and visual content.")
 
 if __name__ == "__main__":
-
+    iface.launch()
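A last observation on authenticate_youtube(): the shell command passes no URL, and yt-dlp normally exits with a usage error before any extractor runs, so the OAuth prompt is more likely to surface on the first real download through the username/password options added above. If a separate one-time authentication step is wanted, pointing the command at an actual video should do it; the URL below is a placeholder.

import os

# Placeholder URL; the first authenticated run is expected to print the
# plugin's device-code prompt, after which the token is cached and reused.
os.system('yt-dlp --username oauth2 --password "" "https://www.youtube.com/watch?v=<VIDEO_ID>"')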