import gradio as gr
import cv2
import numpy as np
from tensorflow.keras.models import load_model
import mediapipe as mp

# Load the label-to-letter mapping
from config import label_to_alphabet  # Ensure this file has the correct mapping

# Load the saved ASL model
model = load_model("model/asl_model.h5")

# Initialize MediaPipe for hand detection
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils  # For drawing hand landmarks


def detect_and_crop_hand(image):
    """
    Detect the hand in the image, crop that region, and return the cropped hand image.
    Returns None if no hand is detected.
    """
    # Gradio delivers frames as RGB arrays, which is what MediaPipe expects,
    # so no BGR-to-RGB conversion is needed here.
    results = hands.process(image)

    if results.multi_hand_landmarks:
        # max_num_hands=1, so at most one set of landmarks is returned
        hand_landmarks = results.multi_hand_landmarks[0]

        # Get the image dimensions
        h, w, _ = image.shape
        x_min, y_min = w, h
        x_max = y_max = 0

        # Loop through the landmarks to determine the hand's bounding box
        for landmark in hand_landmarks.landmark:
            x, y = int(landmark.x * w), int(landmark.y * h)
            x_min = min(x, x_min)
            y_min = min(y, y_min)
            x_max = max(x, x_max)
            y_max = max(y, y_max)

        # Clamp the box to the image bounds (landmarks can fall slightly outside the frame)
        x_min, y_min = max(x_min, 0), max(y_min, 0)
        x_max, y_max = min(x_max, w), min(y_max, h)

        # Crop the hand region from the image
        cropped_hand = image[y_min:y_max, x_min:x_max]

        # Guard against a degenerate (empty) crop
        if cropped_hand.size == 0:
            return None

        # Optional: draw the landmarks on the original image for debugging
        mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        return cropped_hand

    # No hand detected
    return None


def preprocess_hand_image(hand_image):
    """
    Preprocess the cropped hand image for the ASL recognition model:
    resize, normalize, and add a batch dimension.
    """
    # Resize to the model's input size (150x150 here; adjust to your model)
    hand_image_resized = cv2.resize(hand_image, (150, 150))

    # Normalize pixel values to [0, 1]
    hand_image_normalized = hand_image_resized / 255.0

    # Add a batch dimension so the shape becomes (1, 150, 150, 3)
    hand_image_reshaped = np.expand_dims(hand_image_normalized, axis=0)

    return hand_image_reshaped


def predict_asl_alphabet(cropped_hand):
    """
    Feed the cropped hand image into the ASL recognition model and
    return the predicted letter.
    """
    # Preprocess the hand image
    processed_hand = preprocess_hand_image(cropped_hand)

    # Run inference with the ASL model
    predictions = model.predict(processed_hand)

    # Take the class index with the highest predicted probability
    predicted_label = np.argmax(predictions[0])

    # Map the class index to its letter
    predicted_alphabet = label_to_alphabet[predicted_label]

    return predicted_alphabet


def process_video_frame(image):
    """
    Gradio interface function: detect and crop the hand from a webcam frame,
    then predict the ASL letter.
    """
    cropped_hand = detect_and_crop_hand(image)
    if cropped_hand is None:
        return "No hand detected"

    # Predict the ASL letter from the cropped hand image
    return predict_asl_alphabet(cropped_hand)


# Gradio interface setup
iface = gr.Interface(
    fn=process_video_frame,
    inputs=gr.Image(sources=["webcam"], streaming=True),  # Webcam input
    outputs="text",                                       # Display the predicted letter
    live=True,                                            # Re-run on each new frame
    description="Real-Time ASL Hand Gesture Recognition",
)

# Launch the Gradio app
iface.launch()
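
# --- Sketch of the imported config module (an assumption; config.py is not shown above) ---
# The script imports label_to_alphabet from config but never defines it. A minimal
# config.py might look like the commented sketch below, assuming the model was trained
# on the common 29-class ASL alphabet dataset (A-Z plus "space", "del", "nothing")
# with classes indexed in alphabetical order. Adjust the mapping to match the label
# order actually used during training.
#
#   # config.py
#   import string
#
#   # Map class index to letter: 0 -> "A", ..., 25 -> "Z" (assumed label order)
#   label_to_alphabet = {i: letter for i, letter in enumerate(string.ascii_uppercase)}
#
#   # Extra classes, if your dataset includes them (assumed indices)
#   label_to_alphabet.update({26: "space", 27: "del", 28: "nothing"})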