import time

import cv2
import gradio as gr
import torch

from pipelines.pipeline import InferencePipeline


class ChaplinGradio:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vsr_model = None
        self.load_models()

        # Video params
        self.fps = 16
        self.frame_interval = 1 / self.fps
        self.frame_compression = 25  # JPEG quality (0-100); lower = stronger compression
        self.last_frame_time = time.time()

    def load_models(self):
        """Load models using the InferencePipeline with HF Space defaults."""
        config = {
            "model": {
                "name": "chaplin_vsr",
                "weights": "models/chaplin_vsr.pth",
                "detector": "mediapipe",
            }
        }
        self.vsr_model = InferencePipeline(
            config,
            device=self.device,
            detector="mediapipe",
            face_track=True,
        )
        print("Model loaded successfully!")

    def process_frame(self, frame):
        """Process a single frame with rate limiting and compression."""
        # Drop frames that arrive faster than the target fps
        current_time = time.time()
        if current_time - self.last_frame_time < self.frame_interval:
            return None
        self.last_frame_time = current_time

        if frame is None:
            return "No video input detected"

        # Compress the frame via an in-memory JPEG round-trip, decoding to grayscale
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.frame_compression]
        _, buffer = cv2.imencode('.jpg', frame, encode_param)
        compressed_frame = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)

        # Run inference using the VSR model
        predicted_text = self.vsr_model.process_frame(compressed_frame)

        return predicted_text


# Create the Gradio interface
chaplin = ChaplinGradio()
iface = gr.Interface(
    fn=chaplin.process_frame,
    inputs=gr.Image(source="webcam", streaming=True),
    outputs=gr.Textbox(label="Predicted Text"),
    title="Chaplin - Live Visual Speech Recognition",
    description="Use your webcam to perform real-time visual speech recognition.",
    live=True,
)

if __name__ == "__main__":
    iface.launch()
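
# Optional sanity check for the JPEG round-trip used in process_frame above.
# A minimal sketch, not part of the original app, assuming numpy is available
# (OpenCV already depends on it). It fakes a webcam-sized BGR frame, encodes
# it at the same quality setting, and reports the size reduction and decoded
# grayscale shape. Nothing in the app calls this; invoke it by hand from a
# REPL after importing this module.
def _jpeg_roundtrip_demo(quality=25):
    import numpy as np

    frame = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # stand-in for a real capture
    ok, buf = cv2.imencode('.jpg', frame, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
    assert ok, "JPEG encoding failed"
    gray = cv2.imdecode(buf, cv2.IMREAD_GRAYSCALE)
    print(f"raw: {frame.nbytes} bytes, jpeg: {buf.nbytes} bytes, decoded shape: {gray.shape}")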