# Chaplin — live visual speech recognition demo (Hugging Face Space).
# NOTE(review): the Space previously showed "Runtime error" at startup;
# see the Gradio Image `sources=` fix at the bottom of this file.
import gradio as gr | |
import cv2 | |
import torch | |
from pipelines.pipeline import InferencePipeline | |
import time | |
from huggingface_hub import hf_hub_download | |
import os | |
class ChaplinGradio:
    """Live visual speech recognition (lip reading) from webcam frames.

    Downloads the LRS3 VSR model and the subword language model from the
    HuggingFace Hub, loads them through InferencePipeline, and exposes
    process_frame() for per-frame inference with rate limiting and JPEG
    compression.
    """

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vsr_model = None
        self.download_models()
        self.load_models()

        # Video params: throttle inference to at most `fps` frames/second.
        self.fps = 16
        self.frame_interval = 1 / self.fps
        # JPEG quality (0-100); a low value shrinks frames before inference.
        self.frame_compression = 25
        self.last_frame_time = time.time()

    def download_models(self):
        """Download required model files from the HuggingFace Hub."""
        # (repo_id, local_dir) pairs: the VSR model and the language model.
        repos = [
            ("willwade/LRS3_V_WER19.1",
             "benchmarks/LRS3/models/LRS3_V_WER19.1"),
            ("willwade/lm_en_subword",
             "benchmarks/LRS3/language_models/lm_en_subword"),
        ]
        for repo_id, local_dir in repos:
            os.makedirs(local_dir, exist_ok=True)
            # Each repo ships a weights file and a config file.
            for filename in ("model.pth", "model.json"):
                hf_hub_download(repo_id=repo_id,
                                filename=filename,
                                local_dir=local_dir)
        print("Models downloaded successfully!")

    def load_models(self):
        """Load models using the InferencePipeline with the LRS3 config."""
        config_path = "configs/LRS3_V_WER19.1.ini"
        self.vsr_model = InferencePipeline(
            config_path,
            device=self.device,
            detector="mediapipe",
            face_track=True
        )
        print("Model loaded successfully!")

    def process_frame(self, frame):
        """Process a single frame with rate limiting and compression.

        Returns the predicted text, a status string when no video input is
        detected, or None when the frame is dropped by rate limiting.
        """
        # Check for missing input BEFORE touching the rate limiter, so a
        # None frame does not consume the throttle window (the original
        # updated last_frame_time first and could starve real frames).
        if frame is None:
            return "No video input detected"

        current_time = time.time()
        if current_time - self.last_frame_time < self.frame_interval:
            return None
        self.last_frame_time = current_time

        # Compress via a JPEG round trip, decoding straight to grayscale.
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.frame_compression]
        _, buffer = cv2.imencode('.jpg', frame, encode_param)
        compressed_frame = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)

        # Run inference using the VSR model.
        # NOTE(review): assumes InferencePipeline exposes process_frame();
        # confirm against pipelines/pipeline.py.
        predicted_text = self.vsr_model.process_frame(compressed_frame)
        return predicted_text
# Create the Gradio interface at module level so the Spaces runtime can
# discover it on import.
chaplin = ChaplinGradio()

iface = gr.Interface(
    fn=chaplin.process_frame,
    # Gradio 4.x removed the `source=` keyword in favor of `sources=` (a
    # list); the old spelling raises TypeError at startup — the likely
    # cause of the Space's "Runtime error" banner.
    inputs=gr.Image(sources=["webcam"], streaming=True),
    outputs=gr.Textbox(label="Predicted Text"),
    title="Chaplin - Live Visual Speech Recognition",
    description="Use your webcam to perform real-time visual speech recognition.",
    live=True
)

if __name__ == "__main__":
    iface.launch()