willwade committed on
Commit
5b30a24
·
verified ·
1 Parent(s): 9a1700f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import cv2
3
+ import torch
4
+ from pipelines.pipeline import InferencePipeline
5
+ import time
6
+
7
+
8
class ChaplinGradio:
    """Gradio-facing wrapper around the visual speech recognition pipeline.

    The model is loaded once at construction. ``process_frame`` is meant to
    be used as the callback of a streaming webcam input; it throttles
    inference to ``fps`` frames per second and JPEG-compresses each frame
    before handing it to the model.
    """

    def __init__(self):
        """Pick a device, load the model and initialise throttling state."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vsr_model = None
        self.load_models()

        # Video params
        self.fps = 16
        self.frame_interval = 1 / self.fps
        # Passed to OpenCV as IMWRITE_JPEG_QUALITY (lower = smaller/lossier).
        self.frame_compression = 25
        # Monotonic clock: the throttle measures elapsed time, so it must not
        # be affected by wall-clock adjustments.
        self.last_frame_time = time.monotonic()
        # Most recent model output, re-sent for frames the throttle skips so
        # the output textbox is not blanked between inferences.
        self._last_prediction = ""

    def load_models(self):
        """Load models using the InferencePipeline with HF Space defaults"""
        # NOTE(review): the config schema and the InferencePipeline
        # constructor signature are defined in pipelines.pipeline, which is
        # not visible here -- confirm these arguments against that module.
        config = {
            "model": {
                "name": "chaplin_vsr",
                "weights": "models/chaplin_vsr.pth",
                "detector": "mediapipe",
            }
        }

        self.vsr_model = InferencePipeline(
            config,
            device=self.device,
            detector="mediapipe",
            face_track=True,
        )
        print("Model loaded successfully!")

    def process_frame(self, frame):
        """Process a single frame with rate limiting and compression.

        Args:
            frame: Image delivered by the Gradio webcam component, or None
                when no frame is available. Presumably an RGB numpy array
                (Gradio's default image type) -- confirm against the
                component configuration.

        Returns:
            "No video input detected" when ``frame`` is None; otherwise the
            model's prediction for this frame. When the frame is skipped
            (rate limit hit, or JPEG encoding failed) the most recent
            earlier prediction is returned instead ("" if there is none).
        """
        if frame is None:
            return "No video input detected"

        now = time.monotonic()
        if now - self.last_frame_time < self.frame_interval:
            # Too soon since the last inference: keep showing the previous
            # text rather than returning None, which would clear the output.
            return self._last_prediction
        self.last_frame_time = now

        # OpenCV's codecs assume BGR channel order. Swap the channels of a
        # 3-channel frame so the grayscale conversion below applies the
        # correct red/blue luminance weights.
        if getattr(frame, "ndim", 0) == 3 and frame.shape[2] == 3:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # Compress frame
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.frame_compression]
        encoded_ok, buffer = cv2.imencode('.jpg', frame, encode_param)
        if not encoded_ok:
            # Best effort for a live stream: drop the bad frame instead of
            # feeding an invalid buffer to the decoder and the model.
            return self._last_prediction
        compressed_frame = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)

        # Run inference using the VSR model
        # NOTE(review): assumes the pipeline exposes process_frame() taking a
        # single grayscale image -- confirm against pipelines.pipeline.
        self._last_prediction = self.vsr_model.process_frame(compressed_frame)
        return self._last_prediction
59
+
60
+
61
# Create Gradio interface.
# Instantiating ChaplinGradio loads the model, so this runs once when the
# module is imported (which is how a hosted app entry point is started).
chaplin = ChaplinGradio()

# The webcam keyword differs between Gradio major versions: older releases
# take ``source="webcam"``, newer ones reject it with a TypeError and expect
# ``sources=["webcam"]``. Try the original spelling first so behaviour is
# unchanged where it already worked.
try:
    _webcam_input = gr.Image(source="webcam", streaming=True)
except TypeError:
    _webcam_input = gr.Image(sources=["webcam"], streaming=True)

iface = gr.Interface(
    fn=chaplin.process_frame,
    inputs=_webcam_input,
    outputs=gr.Textbox(label="Predicted Text"),
    title="Chaplin - Live Visual Speech Recognition",
    description="Use your webcam to perform real-time visual speech recognition.",
    live=True,
)

if __name__ == "__main__":
    iface.launch()