import os
import time

import cv2
import gradio as gr
import torch
from huggingface_hub import hf_hub_download

from pipelines.pipeline import InferencePipeline


class ChaplinGradio:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vsr_model = None
        self.download_models()
        self.load_models()

        # Video params
        self.fps = 16
        self.frame_interval = 1 / self.fps
        self.frame_compression = 25
        self.last_frame_time = time.time()

    def download_models(self):
        """Download required model files from HuggingFace"""
        # Create directories if they don't exist
        os.makedirs("benchmarks/LRS3/models/LRS3_V_WER19.1", exist_ok=True)
        os.makedirs("benchmarks/LRS3/language_models/lm_en_subword", exist_ok=True)

        # Download VSR model files
        hf_hub_download(repo_id="willwade/LRS3_V_WER19.1",
                        filename="model.pth",
                        local_dir="benchmarks/LRS3/models/LRS3_V_WER19.1")
        hf_hub_download(repo_id="willwade/LRS3_V_WER19.1",
                        filename="model.json",
                        local_dir="benchmarks/LRS3/models/LRS3_V_WER19.1")

        # Download language model files
        hf_hub_download(repo_id="willwade/lm_en_subword",
                        filename="model.pth",
                        local_dir="benchmarks/LRS3/language_models/lm_en_subword")
        hf_hub_download(repo_id="willwade/lm_en_subword",
                        filename="model.json",
                        local_dir="benchmarks/LRS3/language_models/lm_en_subword")

        print("Models downloaded successfully!")

    def load_models(self):
        """Load models using the InferencePipeline with LRS3 config"""
        config_path = "configs/LRS3_V_WER19.1.ini"
        self.vsr_model = InferencePipeline(
            config_path,
            device=self.device,
            detector="mediapipe",
            face_track=True
        )
        print("Model loaded successfully!")

    def process_frame(self, frame):
        """Process a single frame with rate limiting and compression"""
        # Rate limit: drop frames that arrive faster than the target fps
        current_time = time.time()
        if current_time - self.last_frame_time < self.frame_interval:
            return None
        self.last_frame_time = current_time

        if frame is None:
            return "No video input detected"

        # Compress frame via a JPEG round-trip, decoding back to grayscale
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.frame_compression]
        _, buffer = cv2.imencode('.jpg', frame, encode_param)
        compressed_frame = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)

        # Run inference using the VSR model
        predicted_text = self.vsr_model.process_frame(compressed_frame)

        return predicted_text


# Create Gradio interface
chaplin = ChaplinGradio()

iface = gr.Interface(
    fn=chaplin.process_frame,
    inputs=gr.Image(source="webcam", streaming=True),
    outputs=gr.Textbox(label="Predicted Text"),
    title="Chaplin - Live Visual Speech Recognition",
    description="Use your webcam to perform real-time visual speech recognition.",
    live=True
)

if __name__ == "__main__":
    iface.launch()