Tapanat committed on
Commit 140e3c2 · 1 Parent(s): c6a3c2c

Update app.py

Files changed (1)
  1. app.py +25 -17
app.py CHANGED
@@ -2,30 +2,38 @@ import streamlit as st
 from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import torch
-pip install --upgrade transformers
-
 
 # Load the pre-trained CLIP model and processor
-model_name = "facebook/nougat-base"
+model_name = "dandelin/vilt-b32-finetuned-vqa"  # You can choose a different CLIP variant if needed
 model = CLIPModel.from_pretrained(model_name)
 processor = CLIPProcessor.from_pretrained(model_name)
 
-st.title("Text to Image Generation App")
+st.title("Visual Question Answering App")
 
-# Input text area for user input
-text = st.text_area("Enter a text description:")
+# Input image upload
+image = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
 
-if text:
-    # Generate an image based on the user's input text
-    inputs = processor(text, return_tensors="pt")
-    with torch.no_grad():
-        image_features = model.get_image_features(**inputs)
-    image = image_features.pixel_values[0].permute(1, 2, 0).cpu().numpy()
-    image = (image - image.min()) / (image.max() - image.min())  # Normalize the image
-    image = (image * 255).astype("uint8")
-    image = Image.fromarray(image)
+# Input question area for user input
+question = st.text_area("Ask a question about the image:")
+
+if image and question:
+    # Display the uploaded image
+    st.image(image, caption="Uploaded Image", use_column_width=True)
 
-    # Display the generated image
-    st.image(image, caption="Generated Image", use_column_width=True)
+    # Process the image and question for VQA
+    inputs = processor(text=question, images=image, return_tensors="pt")
+
+    # Get the CLIP model's prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Extract the textual answer
+    answer = outputs["text"]
+
+    # Display the answer
+    st.write("Answer:", answer)
 
 st.write("Powered by Hugging Face's CLIP model.")
+
+streamlit run app.py
+
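
Note on the committed version: the new code is unlikely to run as written. CLIPProcessor/CLIPModel cannot load the dandelin/vilt-b32-finetuned-vqa checkpoint (it is a ViLT model, not a CLIP variant), CLIP outputs have no "text" key to read an answer from, and the trailing streamlit run app.py is a shell command that raises a SyntaxError inside a Python file. Below is a minimal sketch of how the intended VQA flow could look, assuming the ViltProcessor / ViltForQuestionAnswering classes and the id2label lookup documented on the model card; the load_model helper and the st.cache_resource caching are illustrative choices, not part of this commit.

# Minimal sketch (assumptions noted above): app.py rewritten around ViLT for VQA.
import streamlit as st
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"  # a ViLT checkpoint, not a CLIP variant


@st.cache_resource  # assumes Streamlit >= 1.18; avoids reloading the model on every rerun
def load_model():
    processor = ViltProcessor.from_pretrained(MODEL_NAME)
    model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)
    return processor, model


processor, model = load_model()

st.title("Visual Question Answering App")

uploaded = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
question = st.text_area("Ask a question about the image:")

if uploaded and question:
    # file_uploader returns a file-like object; convert it to a PIL image first
    image = Image.open(uploaded).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # ViLT treats VQA as classification over a fixed set of common answers
    inputs = processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    answer = model.config.id2label[outputs.logits.argmax(-1).item()]

    st.write("Answer:", answer)

st.write("Powered by Hugging Face's ViLT model.")

To launch the app, streamlit run app.py would be executed from a terminal with streamlit, torch, and transformers installed, not placed inside the script; likewise pip install --upgrade transformers belongs in requirements.txt or the shell, which is presumably why that line was removed from the top of the file.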