ParthCodes committed on
Commit 9300af7 · verified · 1 Parent(s): 18a29c8

Update app.py

Files changed (1)
  1. app.py +107 -32
app.py CHANGED
@@ -1,51 +1,126 @@
- import json
- import ffmpeg
- from subprocess import run
  import gradio as gr
- import uuid
- import os
- import stat
- from zipfile import ZipFile
- import whisper_timestamped as whisper
- from transformers import pipeline

- model = whisper.load_model("small", device="cpu")
- sentiment_analysis = pipeline("sentiment-analysis", framework="pt", model="SamLowe/roberta-base-go_emotions", use_fast=True)

- ZipFile("ffmpeg.zip").extractall()
- st = os.stat('ffmpeg')
- os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)

- print("cwd", os.getcwd())
- print(os.listdir())

  def analyze_sentiment(text):
-     results = sentiment_analysis(text)
-     sentiment_results = {result['label']: result['score'] for result in results}
      return sentiment_results


- def transcribe(audio):
-     audio = whisper.load_audio(audio)
-     result = whisper.transcribe(model, audio)
-     print(json.dumps(result, indent=2, ensure_ascii=False))
-     sent_res = analyze_sentiment(result.text)

-     return sent_res


- def video_to_audio(input_video, output_audio):
-     current_path = os.getcwd()
-     common_uuid = uuid.uuid4()
-     audio_file = f"{common_uuid}.wav"
-     run(["ffmpeg", "-i", 'test_video_1.mp4', audio_file])

-     response = transcribe(audio=audio_file)

-     return response

  gr.Interface(
      fn=video_to_audio,
-     inputs=gr.Video(),
      outputs=gr.Textbox()
  ).launch()
 
+ import math
+ import os
+ from io import BytesIO

  import gradio as gr
+ import cv2
+ import requests
+ from pydub import AudioSegment
+ from faster_whisper import WhisperModel

+ model = WhisperModel("small", device="cpu", compute_type="int8")

+ API_KEY = os.getenv("API_KEY")
+
+ FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
+ TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"
+ headers = {"Authorization": "Bearer " + API_KEY}
+
+
+ def extract_frames(video_path):
+     cap = cv2.VideoCapture(video_path)
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     interval = fps  # sample roughly one frame per second of video
+     result = []
+
+     for i in range(0, total_frames, interval):
+         cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         ret, frame = cap.read()
+         if ret:
+             _, img_encoded = cv2.imencode('.jpg', frame)
+             img_bytes = img_encoded.tobytes()
+
+             # send the frame to the facial-emotion model on the Inference API
+             response = requests.post(FACE_API_URL, headers=headers, data=img_bytes)
+             result.append({item['label']: item['score'] for item in response.json()})
+
+     print("Frame extraction completed.")
+
+     cap.release()
+     print(result)
+     return result


  def analyze_sentiment(text):
+     response = requests.post(TEXT_API_URL, headers=headers, json=text)
+     print(response.json())
+     sentiment_list = response.json()[0]
+     print(sentiment_list)
+     sentiment_results = {result['label']: result['score'] for result in sentiment_list}
      return sentiment_results


+ def video_to_audio(input_video):
+     # extract the audio track from the uploaded video (was a hardcoded test file)
+     audio = AudioSegment.from_file(input_video)
+     audio_binary = audio.export(format="wav").read()
+     audio_bytesio = BytesIO(audio_binary)
+
+     segments, info = model.transcribe(audio_bytesio, beam_size=5)
+
+     print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+
+     frames_sentiments = extract_frames(input_video)
+
+     transcript = ''
+     final_output = []
+     for segment in segments:
+         transcript = transcript + segment.text + " "
+         print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+         transcript_segment_sentiment = analyze_sentiment(segment.text)
+
+         emotion_totals = {
+             'admiration': 0.0,
+             'amusement': 0.0,
+             'angry': 0.0,
+             'annoyance': 0.0,
+             'approval': 0.0,
+             'caring': 0.0,
+             'confusion': 0.0,
+             'curiosity': 0.0,
+             'desire': 0.0,
+             'disappointment': 0.0,
+             'disapproval': 0.0,
+             'disgust': 0.0,
+             'embarrassment': 0.0,
+             'excitement': 0.0,
+             'fear': 0.0,
+             'gratitude': 0.0,
+             'grief': 0.0,
+             'happy': 0.0,
+             'love': 0.0,
+             'nervousness': 0.0,
+             'optimism': 0.0,
+             'pride': 0.0,
+             'realization': 0.0,
+             'relief': 0.0,
+             'remorse': 0.0,
+             'sad': 0.0,
+             'surprise': 0.0,
+             'neutral': 0.0
+         }
+
+         # average the per-frame face emotions over the seconds spanned by this segment
+         counter = 0
+         for i in range(math.ceil(segment.start), math.floor(segment.end)):
+             for emotion in frames_sentiments[i].keys():
+                 emotion_totals[emotion] += frames_sentiments[i].get(emotion)
+             counter += 1
+
+         for emotion in emotion_totals:
+             emotion_totals[emotion] /= max(counter, 1)  # guard against sub-second segments
+
+         video_segment_sentiment = emotion_totals
+
+         segment_finals = {segment.id: (segment.text, segment.start, segment.end, transcript_segment_sentiment,
+                                        video_segment_sentiment)}
+         final_output.append(segment_finals)
+         print(segment_finals)
+         print(final_output)
+
+     print(final_output)
+
+     return final_output


  gr.Interface(
      fn=video_to_audio,
+     inputs=gr.Video(sources=["upload"]),
      outputs=gr.Textbox()
  ).launch()
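
A quick way to sanity-check the new Inference API wiring before uploading a video is to hit the two endpoints directly with the same kind of payloads app.py sends. This is only a sketch: the endpoint URLs and the API_KEY secret name come from the code above, while "face.jpg" and the sample sentence are hypothetical test inputs, and the response shapes noted in the comments are simply what the parsing code assumes rather than a documented API contract.

# Sketch: exercise the two hosted models with the same payloads app.py uses.
# Assumes API_KEY is set in the environment; "face.jpg" is a placeholder test image.
import os
import requests

API_KEY = os.getenv("API_KEY")
headers = {"Authorization": "Bearer " + API_KEY}

TEXT_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"
FACE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"

# Text emotions: analyze_sentiment() reads response.json()[0], i.e. it assumes a
# nested list of {"label": ..., "score": ...} dicts.
r = requests.post(TEXT_API_URL, headers=headers, json="I am thrilled this finally works")
print(r.status_code, r.json())

# Face emotions: extract_frames() iterates response.json() directly, i.e. it assumes
# a flat list of {"label": ..., "score": ...} dicts.
with open("face.jpg", "rb") as f:
    r = requests.post(FACE_API_URL, headers=headers, data=f.read())
print(r.status_code, r.json())

On a Space, API_KEY would typically be configured as a repository secret so that os.getenv("API_KEY") picks it up when the app starts.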