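"""Gradio app for multimodal emotion analysis of an uploaded video.

Transcribes the audio with faster-whisper, scores each transcript segment with
a go_emotions text classifier, and scores roughly one frame per second with a
facial-emotion image classifier, both served by the Hugging Face Inference API.
"""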
import math
import os
from io import BytesIO
import gradio as gr
import cv2
import requests
from pydub import AudioSegment
from faster_whisper import WhisperModel

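# Speech-to-text: faster-whisper's "small" model on CPU with int8 quantization.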
model = WhisperModel("small", device="cpu", compute_type="int8")

API_KEY = os.getenv("API_KEY")

FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"
headers = {"Authorization": f"Bearer {API_KEY}"}


def extract_frames(video_path):
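    """Sample roughly one frame per second and score each frame with the
    facial-emotion model. Returns a list of {label: score} dicts.
    """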
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(fps, 1)  # one sampled frame per second; guard against a reported FPS of 0
    result = []

    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            _, img_encoded = cv2.imencode('.jpg', frame)
            img_bytes = img_encoded.tobytes()

            response = requests.post(FACE_API_URL, headers=headers, data=img_bytes)
            result.append({item['label']: item['score'] for item in response.json()})

    cap.release()
    print("Frame extraction completed.")
    return result


def analyze_sentiment(text):
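    """Score `text` with the go_emotions classifier; returns {label: score}."""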
    # The Inference API expects a JSON payload of the form {"inputs": ...}.
    response = requests.post(TEXT_API_URL, headers=headers, json={"inputs": text})
    sentiment_list = response.json()[0]
    sentiment_results = {result['label']: result['score'] for result in sentiment_list}
    return sentiment_results


def video_to_audio(input_video):
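    """Transcribe the video's audio, then pair each transcript segment with
    its text sentiment and the averaged facial emotions of its frames.
    """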
    audio = AudioSegment.from_file(input_video)  # was hardcoded to a test file
    audio_binary = audio.export(format="wav").read()
    audio_bytesio = BytesIO(audio_binary)

    segments, info = model.transcribe(audio_bytesio, beam_size=5)

    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    frames_sentiments = extract_frames(input_video)

    transcript = ''
    final_output = []
    for segment in segments:
        transcript = transcript + segment.text + " "
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
        transcript_segment_sentiment = analyze_sentiment(segment.text)

        # Running totals for every label either model can return (the
        # go_emotions label set plus the face model's angry/happy/sad variants).
        emotion_totals = dict.fromkeys([
            'admiration', 'amusement', 'angry', 'annoyance', 'approval',
            'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
            'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
            'gratitude', 'grief', 'happy', 'love', 'nervousness', 'optimism',
            'pride', 'realization', 'relief', 'remorse', 'sad', 'surprise',
            'neutral'
        ], 0.0)

        # Average the per-frame face emotions over the seconds covered by this
        # segment; clamp to the number of frames actually sampled.
        counter = 0
        for i in range(math.ceil(segment.start), math.floor(segment.end)):
            if i >= len(frames_sentiments):
                break
            for emotion in frames_sentiments[i].keys():
                emotion_totals[emotion] += frames_sentiments[i].get(emotion)
            counter += 1

        for emotion in emotion_totals:
            emotion_totals[emotion] /= max(counter, 1)  # sub-second segments would otherwise divide by zero

        video_segment_sentiment = emotion_totals

        segment_finals = {segment.id: (segment.text, segment.start, segment.end, transcript_segment_sentiment,
                                       video_segment_sentiment)}
        final_output.append(segment_finals)
        print(segment_finals)

    print(final_output)

    return final_output


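# Minimal Gradio UI: upload a video, get the per-segment analysis back as text.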
gr.Interface(
    fn=video_to_audio,
    inputs=gr.Video(sources=["upload"]),
    outputs=gr.Textbox()
).launch()
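
# To try it locally (assumes ffmpeg is available for pydub and that this file
# is saved as app.py; the Hugging Face token is read from the API_KEY env var):
#   export API_KEY=hf_...
#   python app.py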