import math
import os
from io import BytesIO

import gradio as gr
import cv2
import requests
from pydub import AudioSegment
from faster_whisper import WhisperModel

# Speech-to-text runs locally; both emotion models are called via the HF Inference API.
model = WhisperModel("small", device="cpu", compute_type="int8")

API_KEY = os.getenv("API_KEY")

FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"

headers = {"Authorization": f"Bearer {API_KEY}"}

def extract_frames(video_path):
    """Sample one frame per second and score each one with the facial-emotion model."""
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = fps  # one sampled frame per second of video
    result = []
    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            _, img_encoded = cv2.imencode('.jpg', frame)
            img_bytes = img_encoded.tobytes()
            response = requests.post(FACE_API_URL, headers=headers, data=img_bytes)
            result.append({item['label']: item['score'] for item in response.json()})
    print("Frame extraction completed.")
    cap.release()
    return result
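
# For reference (not in the original): a successful call to the face endpoint
# returns a list of label/score pairs, e.g. with illustrative values
#   [{"label": "happy", "score": 0.93}, {"label": "neutral", "score": 0.04}, ...]
# while errors come back as a dict such as {"error": "..."}. A hypothetical
# guard like this one keeps a single failed frame from crashing the whole run:
def scores_or_empty(payload):
    """Return a label->score dict, or {} if the API sent an error payload."""
    if isinstance(payload, list):
        return {item['label']: item['score'] for item in payload}
    return {}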

def analyze_sentiment(text):
    """Score one transcript segment with the go_emotions text classifier."""
    # The Inference API documents its payload as {"inputs": <text>},
    # not a bare JSON string.
    response = requests.post(TEXT_API_URL, headers=headers, json={"inputs": text})
    sentiment_list = response.json()[0]
    sentiment_results = {result['label']: result['score'] for result in sentiment_list}
    return sentiment_results
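
# Illustrative usage (values are made up): the endpoint returns one list of
# label/score dicts per input string, so analyze_sentiment yields a mapping like
#   analyze_sentiment("Thanks, this is great!")
#   -> {'gratitude': 0.92, 'admiration': 0.03, ..., 'neutral': 0.01}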

def video_to_audio(input_video):
    """Transcribe the uploaded video and fuse per-segment text and facial emotions."""
    # Read the uploaded file; the original hardcoded 'test_video_1.mp4', which
    # does not exist in the Space and is a likely cause of the runtime error.
    audio = AudioSegment.from_file(input_video)
    audio_binary = audio.export(format="wav").read()
    audio_bytesio = BytesIO(audio_binary)

    segments, info = model.transcribe(audio_bytesio, beam_size=5)
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    frames_sentiments = extract_frames(input_video)

    transcript = ''
    final_output = []
    for segment in segments:
        transcript += segment.text + " "
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
        transcript_segment_sentiment = analyze_sentiment(segment.text)
        # Running totals for the per-frame facial emotions. The label list is
        # kept from the original; labels the face model never returns stay 0.0.
        emotion_labels = [
            'admiration', 'amusement', 'angry', 'annoyance', 'approval', 'caring',
            'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
            'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
            'happy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
            'relief', 'remorse', 'sad', 'surprise', 'neutral'
        ]
        emotion_totals = dict.fromkeys(emotion_labels, 0.0)
        # Average the sampled-frame emotions over the seconds this segment spans.
        counter = 0
        for i in range(math.ceil(segment.start), math.floor(segment.end)):
            if i >= len(frames_sentiments):
                break  # segment runs past the last sampled frame
            for emotion, score in frames_sentiments[i].items():
                emotion_totals[emotion] += score
            counter += 1
        if counter:  # avoid ZeroDivisionError for sub-second segments
            for emotion in emotion_totals:
                emotion_totals[emotion] /= counter
        video_segment_sentiment = emotion_totals

        segment_finals = {segment.id: (segment.text, segment.start, segment.end,
                                       transcript_segment_sentiment, video_segment_sentiment)}
        final_output.append(segment_finals)
        print(segment_finals)

    print(final_output)
    return final_output
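
# The interface below renders the return value in a Textbox, so Gradio will just
# str() the list of dicts. A hedged sketch of a formatter (the name and layout
# are my own, not from the original) that video_to_audio could return instead:
def format_output(final_output):
    lines = []
    for segment_finals in final_output:
        for _, (text, start, end, text_sent, face_sent) in segment_finals.items():
            top_text = max(text_sent, key=text_sent.get)  # strongest text emotion
            top_face = max(face_sent, key=face_sent.get)  # strongest facial emotion
            lines.append("[%.2fs -> %.2fs] %s | text: %s | face: %s"
                         % (start, end, text.strip(), top_text, top_face))
    return "\n".join(lines)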

gr.Interface(
    fn=video_to_audio,
    inputs=gr.Video(sources=["upload"]),
    outputs=gr.Textbox()
).launch()