import json
import gradio as gr
import torch

from transformers import EfficientFormerImageProcessor, EfficientFormerForImageClassificationWithTeacher

# Load the image preprocessor and the pretrained classifier once at import
# time so every request handled by `classify_image` reuses the same objects.
model_name = "snap-research/efficientformer-l7-300"
processor = EfficientFormerImageProcessor.from_pretrained(model_name)
model = EfficientFormerForImageClassificationWithTeacher.from_pretrained(model_name)


# Load the ImageNet class-index -> label mapping. Keys are looked up as
# strings (`idx_to_label[str(idx)]`). The encoding is given explicitly so
# decoding does not depend on the platform's default locale.
with open("assets/imagenet_1000_idx2labels.json", encoding="utf-8") as f:
    idx_to_label = json.load(f)


def classify_image(img, top_k):
    """Classify an image and return its most probable labels.

    Args:
        img: Input image, forwarded unchanged to ``processor(images=...)``.
            (Presumably whatever the ``gr.Image`` input delivers -- verify
            against the installed Gradio version.)
        top_k: Number of predictions to return. Any numeric value is
            accepted; it is truncated to an int and clamped to the range
            ``[0, number_of_classes]`` because slider widgets may deliver
            floats, which cannot be used for slicing or as ``k``.

    Returns:
        A dict mapping label string -> probability rounded to 4 decimals,
        inserted in descending order of probability. If two class indices
        share the same label string they collapse into a single entry.
    """
    # Preprocess the input image into model-ready tensors.
    inputs = processor(images=img, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Turn the logits of the single image in the batch into probabilities.
    probs = torch.nn.functional.softmax(logits, dim=1)[0]

    # Coerce/clamp k so a float or out-of-range slider value cannot crash.
    k = max(0, min(int(top_k), probs.shape[0]))

    # topk returns the k largest values (sorted descending) with their indices,
    # avoiding a full sort over every class.
    top_probs, top_indices = torch.topk(probs, k)

    return {
        idx_to_label[str(idx)]: round(prob, 4)
        for idx, prob in zip(top_indices.tolist(), top_probs.tolist())
    }


# Blurb (HTML links plus plain text) passed to gr.Interface as `description`.
description = """
Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/efficientformer">EfficientFormer</a>, 
introduced in <a href="https://arxiv.org/abs/2206.01191">EfficientFormer: Vision Transformers at MobileNet Speed</a>. 
\n\nEfficientFormer is a mobile-friendly image classification model that achieves MobileNet inference speed with impressive performance gains. 
To use it, simply upload an image and print the top predictions.
"""

# Build the demo UI: an image input plus a slider choosing how many
# predictions to show, wired to `classify_image`, with a label output.
demo = gr.Interface(
    classify_image,
    inputs=[
        gr.Image(),
        # Start at 1 (0 predictions is an empty result) and step by whole
        # numbers so the handler receives an integer-valued count.
        gr.Slider(1, 1000, value=5, step=1, label="Number of top predictions"),
    ],
    # Use the top-level component API, consistent with gr.Image / gr.Slider
    # above; the legacy `gr.outputs` namespace is deprecated.
    outputs=gr.Label(),
    description=description,
    # The loaded checkpoint is "efficientformer-l7-300", so the title names L7.
    title="Image Classification with EfficientFormer-L7",
    examples=[
        ["assets/halloween-gaf8ad7ebc_1920.jpeg", 5],
        ["assets/IMG_4484.jpeg", 5],
        ["assets/IMG_4737.jpeg", 5],
        ["assets/IMG_4740.jpeg", 5],
    ],
)
demo.launch()