import math
import os
from io import BytesIO

import cv2
import gradio as gr
import requests
from faster_whisper import WhisperModel
from pydub import AudioSegment

# Local speech-to-text model (CPU with int8 quantization to keep memory low)
model = WhisperModel("small", device="cpu", compute_type="int8")

# Hugging Face Inference API endpoints for facial and text emotion classification
API_KEY = os.getenv("API_KEY")
FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"
headers = {"Authorization": f"Bearer {API_KEY}"}
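
# Illustrative response shapes this app assumes from the Inference API (based on how the
# label/score pairs are consumed below; the values are made up):
#   face model: [{"label": "happy", "score": 0.93}, {"label": "neutral", "score": 0.04}, ...]
#   text model: [[{"label": "admiration", "score": 0.01}, ...]]  (one extra nesting level)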


def extract_frames(video_path):
    """Sample one frame per second and score each with the facial-emotion model."""
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = fps  # step of one second's worth of frames
    result = []
    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            # Encode the frame as JPEG and send the raw bytes to the face-emotion API
            _, img_encoded = cv2.imencode('.jpg', frame)
            img_bytes = img_encoded.tobytes()
            response = requests.post(FACE_API_URL, headers=headers, data=img_bytes)
            result.append({item['label']: item['score'] for item in response.json()})
    print("Frame extraction completed.")
    cap.release()
    print(result)
    return result
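
# Note: extract_frames yields roughly one score dict per second of video, and the segment
# loop in video_to_audio below indexes that list with whole-second transcript timestamps.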


def analyze_sentiment(text):
    """Classify a text snippet with the go_emotions model via the Inference API."""
    response = requests.post(TEXT_API_URL, headers=headers, json=text)
    print(response.json())
    sentiment_list = response.json()[0]
    print(sentiment_list)
    sentiment_results = {result['label']: result['score'] for result in sentiment_list}
    return sentiment_results


def video_to_audio(input_video):
    """Transcribe the uploaded video and combine text and facial emotion scores per segment."""
    # Pull the audio track out of the uploaded video and hand it to faster-whisper
    audio = AudioSegment.from_file(input_video)
    audio_binary = audio.export(format="wav").read()
    audio_bytesio = BytesIO(audio_binary)

    segments, info = model.transcribe(audio_bytesio, beam_size=5)
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    # Per-second facial-emotion scores for the whole video
    frames_sentiments = extract_frames(input_video)

    transcript = ''
    final_output = []
    for segment in segments:
        transcript = transcript + segment.text + " "
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

        # Emotion scores for the spoken text of this segment
        transcript_segment_sentiment = analyze_sentiment(segment.text)
        # Running totals of the per-frame facial-emotion scores within this segment;
        # only labels returned by the face model are actually incremented below.
        emotion_totals = {emotion: 0.0 for emotion in [
            'admiration', 'amusement', 'angry', 'annoyance', 'approval', 'caring',
            'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
            'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
            'happy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
            'relief', 'remorse', 'sad', 'surprise', 'neutral']}
        # Average the per-second facial-emotion scores that fall inside this segment
        counter = 0
        for i in range(math.ceil(segment.start), math.floor(segment.end)):
            for emotion in frames_sentiments[i].keys():
                emotion_totals[emotion] += frames_sentiments[i].get(emotion)
            counter += 1

        if counter > 0:  # guard against segments shorter than one second
            for emotion in emotion_totals:
                emotion_totals[emotion] /= counter

        video_segment_sentiment = emotion_totals

        segment_finals = {segment.id: (segment.text, segment.start, segment.end,
                                       transcript_segment_sentiment, video_segment_sentiment)}
        final_output.append(segment_finals)
        print(segment_finals)

    print(final_output)
    return final_output


# Gradio UI: upload a video, get the combined per-segment emotion analysis back as text
gr.Interface(
    fn=video_to_audio,
    inputs=gr.Video(sources=["upload"]),
    outputs=gr.Textbox()
).launch()
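
# Deployment notes (not enforced by the code): the Space expects an API_KEY secret holding a
# Hugging Face Inference API token, and pydub needs ffmpeg on the system to decode the
# uploaded video before transcription.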