Spaces:
Build error
Update app.py
app.py CHANGED
@@ -2,30 +2,38 @@ import streamlit as st
 from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import torch
-pip install --upgrade transformers
-
 
 # Load the pre-trained CLIP model and processor
-model_name = "
+model_name = "dandelin/vilt-b32-finetuned-vqa"  # You can choose a different CLIP variant if needed
 model = CLIPModel.from_pretrained(model_name)
 processor = CLIPProcessor.from_pretrained(model_name)
 
-st.title("
+st.title("Visual Question Answering App")
 
-# Input
-
+# Input image upload
+image = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
 
-
-
-
-
-
-image =
-image = (image - image.min()) / (image.max() - image.min())  # Normalize the image
-image = (image * 255).astype("uint8")
-image = Image.fromarray(image)
+# Input question area for user input
+question = st.text_area("Ask a question about the image:")
+
+if image and question:
+    # Display the uploaded image
+    st.image(image, caption="Uploaded Image", use_column_width=True)
 
-#
-
+    # Process the image and question for VQA
+    inputs = processor(text=question, images=image, return_tensors="pt")
+
+    # Get the CLIP model's prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Extract the textual answer
+    answer = outputs["text"]
+
+    # Display the answer
+    st.write("Answer:", answer)
 
 st.write("Powered by Hugging Face's CLIP model.")
+
+streamlit run app.py
+
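Even after this commit the Space still shows a build error, and the diff explains why: `streamlit run app.py` is a shell command pasted into the Python file (a SyntaxError at import time), `dandelin/vilt-b32-finetuned-vqa` is a ViLT checkpoint rather than a CLIP one, so `CLIPModel.from_pretrained` will not load it cleanly, and CLIP outputs expose image-text similarity logits, not an `outputs["text"]` answer string. A minimal working sketch of the same app, assuming the intent is ViLT-based visual question answering (the `Vilt*` classes are the transformers counterparts for this checkpoint; variable names are illustrative):

import streamlit as st
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

# Load the ViLT model fine-tuned for VQA (the checkpoint named in the diff)
model_name = "dandelin/vilt-b32-finetuned-vqa"
processor = ViltProcessor.from_pretrained(model_name)
model = ViltForQuestionAnswering.from_pretrained(model_name)

st.title("Visual Question Answering App")

uploaded = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
question = st.text_area("Ask a question about the image:")

if uploaded and question:
    # st.file_uploader returns a file-like object; the processor needs a PIL image
    image = Image.open(uploaded).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Encode the image-question pair and score ViLT's fixed VQA answer vocabulary
    inputs = processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # The highest-scoring class index maps to an answer string via id2label
    answer = model.config.id2label[outputs.logits.argmax(-1).item()]
    st.write("Answer:", answer)

st.write("Powered by Hugging Face's ViLT model.")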
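Two smaller notes on the remaining lines: `streamlit run app.py` belongs in the terminal (or the Space's start command), not at the bottom of app.py, and the removed `pip install --upgrade transformers` belongs in the Space's requirements.txt (listing, say, streamlit, torch, transformers, and Pillow) rather than in the script. Recent Streamlit releases also deprecate `use_column_width=True` in `st.image` in favor of `use_container_width=True`.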