import streamlit as st
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch

# Load model and processor
@st.cache_resource  # Cache model to avoid reloading
def load_model():
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    return processor, model

# Extract text from image using SmolVLM
def extract_text(image, processor, model):
    # Preprocess image
    inputs = processor(images=image, text="What is the text in this image? extract all data in JSON format", return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(**inputs)

    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return result

# Streamlit UI
def main():
    st.title("🖼️ OCR App using SmolVLM")
    st.write("Upload an image, and I will extract the text for you!")

    # Load the model and processor
    processor, model = load_model()

    # File uploader
    uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        # Open image
        image = Image.open(uploaded_file).convert("RGB")
        st.image(image, caption="Uploaded Image", use_column_width=True)

        # Extract text
        with st.spinner("Extracting text..."):
            extracted_text = extract_text(image, processor, model)

        # Display result
        st.subheader("📝 Extracted Text:")
        st.write(extracted_text)

if __name__ == "__main__":
    main()