import math
import os
from io import BytesIO

import cv2
import gradio as gr
import requests
from faster_whisper import WhisperModel
from pydub import AudioSegment

model = WhisperModel("small", device="cpu", compute_type="int8")

API_KEY = os.getenv("API_KEY")
FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"
headers = {"Authorization": f"Bearer {API_KEY}"}


def extract_frames(video_path):
    """Sample one frame per second and score facial emotions via the Hugging Face Inference API."""
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = fps  # one sampled frame per second of video
    result = []
    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            # Encode the frame as JPEG bytes and send it to the face-emotion model.
            _, img_encoded = cv2.imencode(".jpg", frame)
            img_bytes = img_encoded.tobytes()
            response = requests.post(FACE_API_URL, headers=headers, data=img_bytes)
            result.append({item["label"]: item["score"] for item in response.json()})
    print("Frame extraction completed.")
    cap.release()
    print(result)
    return result


def analyze_sentiment(text):
    """Classify the emotions of a transcript segment with the go_emotions text model."""
    # The Inference API expects a JSON payload of the form {"inputs": <text>}.
    response = requests.post(TEXT_API_URL, headers=headers, json={"inputs": text})
    print(response.json())
    sentiment_list = response.json()[0]
    print(sentiment_list)
    sentiment_results = {result["label"]: result["score"] for result in sentiment_list}
    return sentiment_results


def video_to_audio(input_video):
    """Transcribe the uploaded video and pair each spoken segment with text and facial emotion scores."""
    audio = AudioSegment.from_file(input_video)
    audio_binary = audio.export(format="wav").read()
    audio_bytesio = BytesIO(audio_binary)

    segments, info = model.transcribe(audio_bytesio, beam_size=5)
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    frames_sentiments = extract_frames(input_video)

    transcript = ""
    final_output = []
    for segment in segments:
        transcript = transcript + segment.text + " "
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

        transcript_segment_sentiment = analyze_sentiment(segment.text)

        # Average the per-second facial emotion scores over the span of this segment.
        emotion_totals = {
            'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0,
            'approval': 0.0, 'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0,
            'desire': 0.0, 'disappointment': 0.0, 'disapproval': 0.0, 'disgust': 0.0,
            'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0, 'gratitude': 0.0,
            'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0,
            'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0,
            'remorse': 0.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0,
        }
        counter = 0
        for i in range(math.ceil(segment.start), math.floor(segment.end)):
            if i >= len(frames_sentiments):
                break  # no sampled frame for this second
            for emotion in frames_sentiments[i].keys():
                emotion_totals[emotion] += frames_sentiments[i].get(emotion)
            counter += 1
        if counter > 0:
            for emotion in emotion_totals:
                emotion_totals[emotion] /= counter
        video_segment_sentiment = emotion_totals

        segment_finals = {
            segment.id: (segment.text, segment.start, segment.end,
                         transcript_segment_sentiment, video_segment_sentiment)
        }
        final_output.append(segment_finals)
        print(segment_finals)

    print(final_output)
    return final_output


gr.Interface(
    fn=video_to_audio,
    inputs=gr.Video(sources=["upload"]),
    outputs=gr.Textbox(),
).launch()