import os
import time

import cv2
import gradio as gr
import torch
from huggingface_hub import hf_hub_download

from pipelines.pipeline import InferencePipeline


class ChaplinGradio:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vsr_model = None
        self.download_models()
        self.load_models()

        # Video params
        self.fps = 16
        self.frame_interval = 1 / self.fps
        self.frame_compression = 25
        self.last_frame_time = time.time()

    def download_models(self):
        """Download required model files from HuggingFace"""
        # Create directories if they don't exist
        os.makedirs("benchmarks/LRS3/models/LRS3_V_WER19.1", exist_ok=True)
        os.makedirs("benchmarks/LRS3/language_models/lm_en_subword", exist_ok=True)

        # Download VSR model files
        hf_hub_download(repo_id="willwade/LRS3_V_WER19.1",
                        filename="model.pth",
                        local_dir="benchmarks/LRS3/models/LRS3_V_WER19.1")
        hf_hub_download(repo_id="willwade/LRS3_V_WER19.1",
                        filename="model.json",
                        local_dir="benchmarks/LRS3/models/LRS3_V_WER19.1")

        # Download language model files
        hf_hub_download(repo_id="willwade/lm_en_subword",
                        filename="model.pth",
                        local_dir="benchmarks/LRS3/language_models/lm_en_subword")
        hf_hub_download(repo_id="willwade/lm_en_subword",
                        filename="model.json",
                        local_dir="benchmarks/LRS3/language_models/lm_en_subword")

        print("Models downloaded successfully!")

    def load_models(self):
        """Load models using the InferencePipeline with LRS3 config"""
        config_path = "configs/LRS3_V_WER19.1.ini"
        self.vsr_model = InferencePipeline(
            config_path,
            device=self.device,
            detector="mediapipe",
            face_track=True
        )
        print("Model loaded successfully!")

    def process_frame(self, frame):
        """Process a single frame with rate limiting and compression"""
        # Rate limit: drop frames that arrive faster than the target fps
        current_time = time.time()
        if current_time - self.last_frame_time < self.frame_interval:
            return None
        self.last_frame_time = current_time

        if frame is None:
            return "No video input detected"

        # Compress frame via a JPEG round-trip, decoding back to grayscale
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.frame_compression]
        _, buffer = cv2.imencode('.jpg', frame, encode_param)
        compressed_frame = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)

        # Run inference using the VSR model
        predicted_text = self.vsr_model.process_frame(compressed_frame)

        return predicted_text


# Create Gradio interface
chaplin = ChaplinGradio()

iface = gr.Interface(
    fn=chaplin.process_frame,
    inputs=gr.Image(source="webcam", streaming=True),
    outputs=gr.Textbox(label="Predicted Text"),
    title="Chaplin - Live Visual Speech Recognition",
    description="Use your webcam to perform real-time visual speech recognition.",
    live=True
)

if __name__ == "__main__":
    iface.launch()