# Test_Video / app.py
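"""Gradio app: transcribe a video's audio with faster-whisper, then score
each transcript segment with a text-emotion model
(SamLowe/roberta-base-go_emotions) and the sampled video frames with a
face-emotion model (dima806/facial_emotions_image_detection), both via the
Hugging Face Inference API."""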
import math
import os
from io import BytesIO

import cv2
import gradio as gr
import requests
from faster_whisper import WhisperModel
from pydub import AudioSegment

# Speech-to-text model: small Whisper on CPU with int8 weights to keep memory low.
model = WhisperModel("small", device="cpu", compute_type="int8")

API_KEY = os.getenv("API_KEY")
FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"
headers = {"Authorization": f"Bearer {API_KEY}"}
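
# The Inference API replies with HTTP 503 and an "estimated_time" field while
# a model is still cold-loading. Below is a minimal retry helper sketched under
# that assumption; hf_post and its retry policy are additions and are not wired
# into the calls further down, which keep the original flow.
import time


def hf_post(url, **kwargs):
    """POST to the Inference API, waiting out 503 model-loading responses."""
    for _ in range(5):
        response = requests.post(url, headers=headers, **kwargs)
        if response.status_code != 503:
            return response
        # The 503 body usually carries an estimated warm-up time in seconds.
        time.sleep(response.json().get("estimated_time", 10))
    return response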

def extract_frames(video_path):
    """Sample one frame per second and collect face-emotion scores for each."""
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(fps, 1)  # one frame per second; guard against fps == 0
    result = []
    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            # Encode the frame as JPEG and send the raw bytes to the face model.
            _, img_encoded = cv2.imencode('.jpg', frame)
            img_bytes = img_encoded.tobytes()
            response = requests.post(FACE_API_URL, headers=headers, data=img_bytes)
            result.append({item['label']: item['score'] for item in response.json()})
    cap.release()
    print("Frame extraction completed.")
    print(result)
    return result
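
# For reference, a single face-API response is a list of label/score dicts, e.g.
# [{"label": "happy", "score": 0.92}, {"label": "neutral", "score": 0.05}, ...];
# the label set comes from dima806/facial_emotions_image_detection.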

def analyze_sentiment(text):
    """Score a transcript snippet with the go_emotions text classifier."""
    # The Inference API expects a JSON payload of the form {"inputs": ...}.
    response = requests.post(TEXT_API_URL, headers=headers, json={"inputs": text})
    sentiment_list = response.json()[0]
    print(sentiment_list)
    return {result['label']: result['score'] for result in sentiment_list}
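
# go_emotions responses arrive as a nested list, one inner list per input:
# [[{"label": "admiration", "score": 0.01}, ...]]; hence the [0] above.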

def video_to_audio(input_video):
    """Transcribe the video's audio and pair each transcript segment with
    text-emotion and averaged face-emotion scores."""
    # Extract the audio track and hand it to faster-whisper as a WAV stream.
    audio = AudioSegment.from_file(input_video)
    audio_binary = audio.export(format="wav").read()
    audio_bytesio = BytesIO(audio_binary)

    segments, info = model.transcribe(audio_bytesio, beam_size=5)
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    frames_sentiments = extract_frames(input_video)

    transcript = ''
    final_output = []
    for segment in segments:
        transcript += segment.text + " "
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
        transcript_segment_sentiment = analyze_sentiment(segment.text)
        # Per-segment running totals. Only the face-model labels (e.g. happy,
        # sad, angry, fear, disgust, surprise, neutral) are ever incremented;
        # the remaining keys stay at 0.0.
        emotion_totals = dict.fromkeys([
            'admiration', 'amusement', 'angry', 'annoyance', 'approval',
            'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
            'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
            'gratitude', 'grief', 'happy', 'love', 'nervousness', 'optimism',
            'pride', 'realization', 'relief', 'remorse', 'sad', 'surprise',
            'neutral'
        ], 0.0)
        counter = 0
        # Frames are sampled once per second, so index frames_sentiments by
        # the whole seconds covered by this transcript segment.
        for i in range(math.ceil(segment.start), math.floor(segment.end)):
            if i >= len(frames_sentiments):
                break
            for emotion in frames_sentiments[i]:
                emotion_totals[emotion] += frames_sentiments[i][emotion]
            counter += 1  # count sampled frames, not individual emotion scores

        if counter:  # segments shorter than a second cover no sampled frame
            for emotion in emotion_totals:
                emotion_totals[emotion] /= counter
        video_segment_sentiment = emotion_totals

        segment_finals = {segment.id: (segment.text, segment.start, segment.end,
                                       transcript_segment_sentiment,
                                       video_segment_sentiment)}
        final_output.append(segment_finals)
        print(segment_finals)

    print(final_output)
    return final_output
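
# Each final_output element maps a segment id to
# (text, start, end, text_emotions, averaged_face_emotions), for example:
#   {0: ('Hello there.', 0.0, 1.5, {'neutral': 0.91, ...}, {'happy': 0.62, ...})}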

gr.Interface(
    fn=video_to_audio,
    inputs=gr.Video(sources=["upload"]),
    outputs=gr.Textbox()
).launch()
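
# To run locally (assumptions: ffmpeg is installed for pydub, and a Hugging
# Face token is exported as API_KEY):
#   API_KEY=hf_... python app.py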