"""Streamlit app: score how well a caption matches an uploaded image using CLIP.

NOTE(review): CLIP cannot *generate* captions — it embeds images and text into
a shared space and measures similarity.  The original code tried to "decode"
text out of a CLIPModel output, which is not possible; this version instead
scores the user-supplied description against the uploaded image.
"""

import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# "facebook/nougat-base" is a Nougat (OCR) checkpoint, not a CLIP one —
# loading it through CLIPModel would fail.  Use a real CLIP checkpoint.
MODEL_NAME = "openai/clip-vit-base-patch32"


@st.cache_resource
def load_clip():
    """Load and cache the CLIP model and processor once per server process."""
    model = CLIPModel.from_pretrained(MODEL_NAME)
    processor = CLIPProcessor.from_pretrained(MODEL_NAME)
    model.eval()
    return model, processor


def main() -> None:
    st.title("Image / Text Matching App")

    model, processor = load_clip()

    uploaded = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

    # The widget must exist before its value is read; the original read
    # st.session_state["alt_text"] before the text_area was created, which
    # raises KeyError on the first run.
    alt_text = st.text_area("Provide alternative text for the image:", key="alt_text")

    if uploaded and alt_text:
        # Open via PIL — the processor expects an image, not a raw file handle.
        image = Image.open(uploaded).convert("RGB")
        st.image(image, caption="Uploaded Image", use_column_width=True)

        with torch.no_grad():
            inputs = processor(
                text=["a photo of " + alt_text],
                images=image,
                return_tensors="pt",
            )
            outputs = model(**inputs)
            # logits_per_image[i, j] is the scaled cosine similarity between
            # image i and text j; a single image/text pair yields one score.
            score = outputs.logits_per_image[0, 0].item()

        st.subheader("Match score:")
        st.write(f"{score:.2f} (higher means the text fits the image better)")

    st.write("Powered by Hugging Face's CLIP model.")


# Run with:  streamlit run app.py
# (The original file had this shell command pasted into the Python source,
# which is a syntax error.)
if __name__ == "__main__":
    main()