"""Streamlit app: score how well a caption matches an uploaded image using CLIP.

NOTE(review): CLIP cannot *generate* captions — it embeds images and text into
a shared space and measures similarity.  The original code tried to "decode"
text out of a CLIPModel output, which is not possible; this version instead
scores the user-supplied description against the uploaded image.
"""

import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# "facebook/nougat-base" is a Nougat (OCR) checkpoint, not a CLIP one —
# loading it through CLIPModel would fail.  Use a real CLIP checkpoint.
MODEL_NAME = "openai/clip-vit-base-patch32"


@st.cache_resource
def load_clip():
    """Load and cache the CLIP model and processor once per server process."""
    model = CLIPModel.from_pretrained(MODEL_NAME)
    processor = CLIPProcessor.from_pretrained(MODEL_NAME)
    model.eval()
    return model, processor


def main() -> None:
    st.title("Image / Text Matching App")

    model, processor = load_clip()

    uploaded = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

    # The widget must exist before its value is read; the original read
    # st.session_state["alt_text"] before the text_area was created, which
    # raises KeyError on the first run.
    alt_text = st.text_area("Provide alternative text for the image:", key="alt_text")

    if uploaded and alt_text:
        # Open via PIL — the processor expects an image, not a raw file handle.
        image = Image.open(uploaded).convert("RGB")
        st.image(image, caption="Uploaded Image", use_column_width=True)

        with torch.no_grad():
            inputs = processor(
                text=["a photo of " + alt_text],
                images=image,
                return_tensors="pt",
            )
            outputs = model(**inputs)
            # logits_per_image[i, j] is the scaled cosine similarity between
            # image i and text j; a single image/text pair yields one score.
            score = outputs.logits_per_image[0, 0].item()

        st.subheader("Match score:")
        st.write(f"{score:.2f} (higher means the text fits the image better)")

    st.write("Powered by Hugging Face's CLIP model.")


# Run with:  streamlit run app.py
# (The original file had this shell command pasted into the Python source,
# which is a syntax error.)
if __name__ == "__main__":
    main()