import streamlit as st from transformers import AutoProcessor, AutoModelForImageTextToText from PIL import Image import torch # Load model and processor @st.cache_resource # Cache model to avoid reloading def load_model(): processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct") model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-Instruct") return processor, model # Extract text from image using SmolVLM def extract_text(image, processor, model): # Preprocess image inputs = processor(images=image, text="What is the text in this image? extract all data in JSON format", return_tensors="pt") with torch.no_grad(): outputs = model.generate(**inputs) result = processor.batch_decode(outputs, skip_special_tokens=True)[0] return result # Streamlit UI def main(): st.title("🖼️ OCR App using SmolVLM") st.write("Upload an image, and I will extract the text for you!") # Load the model and processor processor, model = load_model() # File uploader uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"]) if uploaded_file is not None: # Open image image = Image.open(uploaded_file).convert("RGB") st.image(image, caption="Uploaded Image", use_column_width=True) # Extract text with st.spinner("Extracting text..."): extracted_text = extract_text(image, processor, model) # Display result st.subheader("📝 Extracted Text:") st.write(extracted_text) if __name__ == "__main__": main()