Tapanat committed on
Commit 140e3c2 · 1 Parent(s): c6a3c2c

Update app.py

Files changed (1)
  1. app.py +25 -17
app.py CHANGED
@@ -2,30 +2,38 @@ import streamlit as st
 from transformers import CLIPProcessor, CLIPModel
 from PIL import Image
 import torch
-pip install --upgrade transformers
-
 
 # Load the pre-trained CLIP model and processor
-model_name = "facebook/nougat-base"
+model_name = "dandelin/vilt-b32-finetuned-vqa"  # You can choose a different CLIP variant if needed
 model = CLIPModel.from_pretrained(model_name)
 processor = CLIPProcessor.from_pretrained(model_name)
 
-st.title("Text to Image Generation App")
+st.title("Visual Question Answering App")
 
-# Input text area for user input
-text = st.text_area("Enter a text description:")
+# Input image upload
+image = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
 
-if text:
-    # Generate an image based on the user's input text
-    inputs = processor(text, return_tensors="pt")
-    with torch.no_grad():
-        image_features = model.get_image_features(**inputs)
-    image = image_features.pixel_values[0].permute(1, 2, 0).cpu().numpy()
-    image = (image - image.min()) / (image.max() - image.min())  # Normalize the image
-    image = (image * 255).astype("uint8")
-    image = Image.fromarray(image)
+# Input question area for user input
+question = st.text_area("Ask a question about the image:")
+
+if image and question:
+    # Display the uploaded image
+    st.image(image, caption="Uploaded Image", use_column_width=True)
 
-    # Display the generated image
-    st.image(image, caption="Generated Image", use_column_width=True)
+    # Process the image and question for VQA
+    inputs = processor(text=question, images=image, return_tensors="pt")
+
+    # Get the CLIP model's prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Extract the textual answer
+    answer = outputs["text"]
+
+    # Display the answer
+    st.write("Answer:", answer)
 
 st.write("Powered by Hugging Face's CLIP model.")
+
+streamlit run app.py
+
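
Note on the committed version: the new code is unlikely to run as written. CLIPProcessor/CLIPModel cannot load the dandelin/vilt-b32-finetuned-vqa checkpoint (it is a ViLT model, not a CLIP variant), CLIP outputs have no "text" key to read an answer from, and the trailing streamlit run app.py is a shell command that raises a SyntaxError inside a Python file. Below is a minimal sketch of how the intended VQA flow could look, assuming the ViltProcessor / ViltForQuestionAnswering classes and the id2label lookup documented on the model card; the load_model helper and the st.cache_resource caching are illustrative choices, not part of this commit.

# Minimal sketch (assumptions noted above): app.py rewritten around ViLT for VQA.
import streamlit as st
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"  # a ViLT checkpoint, not a CLIP variant


@st.cache_resource  # assumes Streamlit >= 1.18; avoids reloading the model on every rerun
def load_model():
    processor = ViltProcessor.from_pretrained(MODEL_NAME)
    model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)
    return processor, model


processor, model = load_model()

st.title("Visual Question Answering App")

uploaded = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
question = st.text_area("Ask a question about the image:")

if uploaded and question:
    # file_uploader returns a file-like object; convert it to a PIL image first
    image = Image.open(uploaded).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # ViLT treats VQA as classification over a fixed set of common answers
    inputs = processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    answer = model.config.id2label[outputs.logits.argmax(-1).item()]

    st.write("Answer:", answer)

st.write("Powered by Hugging Face's ViLT model.")

To launch the app, streamlit run app.py would be executed from a terminal with streamlit, torch, and transformers installed, not placed inside the script; likewise pip install --upgrade transformers belongs in requirements.txt or the shell, which is presumably why that line was removed from the top of the file.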