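# app.py — Gradio Space that aligns spoken-transcript emotions with facial
# emotions detected in sampled video frames. Audio is transcribed locally with
# faster-whisper; both emotion classifiers are called through the Hugging Face
# Inference API.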
import math
import os
from io import BytesIO

import cv2
import gradio as gr
import requests
from faster_whisper import WhisperModel
from pydub import AudioSegment
# Speech-to-text: small Whisper model on CPU with int8 quantization.
model = WhisperModel("small", device="cpu", compute_type="int8")

# Hugging Face Inference API endpoints for per-frame facial emotion detection
# and per-segment text emotion classification.
API_KEY = os.getenv("API_KEY")
FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"
headers = {"Authorization": f"Bearer {API_KEY}"}
def extract_frames(video_path):
    """Sample one frame per second and score each with the facial-emotion model."""
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = fps  # one sampled frame per second of video
    result = []
    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            # Encode the frame as JPEG and send the raw bytes to the API.
            _, img_encoded = cv2.imencode('.jpg', frame)
            img_bytes = img_encoded.tobytes()
            response = requests.post(FACE_API_URL, headers=headers, data=img_bytes)
            result.append({item['label']: item['score'] for item in response.json()})
    print("Frame extraction completed.")
    cap.release()
    print(result)
    return result
def analyze_sentiment(text):
    """Classify a transcript segment against the 28 go_emotions labels."""
    response = requests.post(TEXT_API_URL, headers=headers, json={"inputs": text})
    print(response.json())
    sentiment_list = response.json()[0]
    print(sentiment_list)
    sentiment_results = {result['label']: result['score'] for result in sentiment_list}
    return sentiment_results
def video_to_audio(input_video):
    """Transcribe the uploaded video's audio and fuse per-segment text and face emotions."""
    audio = AudioSegment.from_file(input_video)
    audio_binary = audio.export(format="wav").read()
    audio_bytesio = BytesIO(audio_binary)
    segments, info = model.transcribe(audio_bytesio, beam_size=5)
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
    frames_sentiments = extract_frames(input_video)
    transcript = ''
    final_output = []
    for segment in segments:
        transcript = transcript + segment.text + " "
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
        transcript_segment_sentiment = analyze_sentiment(segment.text)
        # Running totals over all 28 labels; the facial-emotion model only
        # ever reports a subset of these (angry, disgust, fear, happy, sad,
        # surprise, neutral), so the remaining keys stay at 0.0.
        emotion_totals = {label: 0.0 for label in (
            'admiration', 'amusement', 'angry', 'annoyance', 'approval',
            'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
            'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
            'gratitude', 'grief', 'happy', 'love', 'nervousness', 'optimism',
            'pride', 'realization', 'relief', 'remorse', 'sad', 'surprise',
            'neutral')}
        # Average the per-frame face emotions over the whole seconds this
        # segment spans (frames were sampled at one per second).
        counter = 0
        for i in range(math.ceil(segment.start), math.floor(segment.end)):
            # Guard against segments that run past the last sampled frame.
            if i >= len(frames_sentiments):
                break
            for emotion in frames_sentiments[i].keys():
                emotion_totals[emotion] += frames_sentiments[i].get(emotion)
            counter += 1
        if counter:  # avoid division by zero for sub-second segments
            for emotion in emotion_totals:
                emotion_totals[emotion] /= counter
        video_segment_sentiment = emotion_totals
        segment_finals = {segment.id: (segment.text, segment.start, segment.end,
                                       transcript_segment_sentiment,
                                       video_segment_sentiment)}
        final_output.append(segment_finals)
        print(segment_finals)
    print(final_output)
    return final_output
gr.Interface(
    fn=video_to_audio,
    inputs=gr.Video(sources=["upload"]),
    outputs=gr.Textbox(),
).launch()
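# Note: API_KEY is read from the environment at startup; when this runs as a
# Hugging Face Space it is typically provided as a repository secret, and the
# Inference API calls above will be rejected if it is missing.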