Duplicate from Amrrs/image-caption-with-vit-gpt2

Co-authored-by: amrrs <[email protected]>
- .gitattributes +27 -0
- README.md +47 -0
- app.py +78 -0
- elonmusk.jpeg +0 -0
- people-walking-street-pedestrian-crossing-traffic-light-city.jpeg +0 -0
- requirements.txt +3 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,47 @@
+---
+title: Image Caption With Vit Gpt2
+emoji: 👀
+colorFrom: pink
+colorTo: pink
+sdk: gradio
+app_file: app.py
+pinned: false
+license: mit
+duplicated_from: Amrrs/image-caption-with-vit-gpt2
+---
+
+# Configuration
+
+`title`: _string_
+Display title for the Space
+
+`emoji`: _string_
+Space emoji (emoji-only character allowed)
+
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`sdk`: _string_
+Can be either `gradio`, `streamlit`, or `static`
+
+`sdk_version`: _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+Path is relative to the root of the repository.
+
+`models`: _List[string]_
+HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
+Will be parsed automatically from your code if not specified here.
+
+`datasets`: _List[string]_
+HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
+Will be parsed automatically from your code if not specified here.
+
+`pinned`: _boolean_
+Whether the Space stays on top of your list.
app.py
ADDED
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+"""Image Captioning with ViT+GPT2
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1P3O0gO5AUqSmM8rE9dxy2tXJ-9jkhxHz
+"""
+
+#! pip install transformers -q
+
+#! pip install gradio -q
+
+from PIL import Image
+from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast
+import requests
+
+model = VisionEncoderDecoderModel.from_pretrained("sachin/vit2distilgpt2")  # ViT encoder + DistilGPT2 decoder fine-tuned for captioning
+
+vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")  # resizes/normalizes images into ViT pixel values
+
+tokenizer = PreTrainedTokenizerFast.from_pretrained("distilgpt2")  # decodes generated token ids back into text
+
+# url = 'https://d2gp644kobdlm6.cloudfront.net/wp-content/uploads/2016/06/bigstock-Shocked-and-surprised-boy-on-t-113798588-300x212.jpg'
+
+# with Image.open(requests.get(url, stream=True).raw) as img:
+#     pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
+
+# encoder_outputs = model.generate(pixel_values.to('cpu'), num_beams=5)
+
+# generated_sentences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
+
+# generated_sentences
+
+# naive text processing
+# generated_sentences[0].split('.')[0]
+
+# inference function
+
+def vit2distilgpt2(img):
+    pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
+    generated_ids = model.generate(pixel_values.to('cpu'), num_beams=5)
+    generated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+    return generated_sentences[0].split('.')[0]  # naive post-processing: keep only the first sentence
+
+#!wget https://media.glamour.com/photos/5f171c4fd35176eaedb36823/master/w_2560%2Cc_limit/bike.jpg
+
+import gradio as gr
+
+inputs = [
+    gr.inputs.Image(type="pil", label="Original Image")
+]
+
+outputs = [
+    gr.outputs.Textbox(label='Caption')
+]
+
+title = "Image Captioning using ViT + GPT2"
+description = "ViT and GPT2 are used to generate an image caption for the uploaded image. The COCO dataset was used for training. This image captioning model might have some biases that we couldn't catch during our stress testing, so if you find any bias (gender, race, and so on) please use the `Flag` button to flag the image with the bias."
+article = "<a href='https://huggingface.co/sachin/vit2distilgpt2'>Model Repo on Hugging Face Model Hub</a>"
+examples = [
+    ["people-walking-street-pedestrian-crossing-traffic-light-city.jpeg"],
+    ["elonmusk.jpeg"]
+]
+
+gr.Interface(
+    vit2distilgpt2,
+    inputs,
+    outputs,
+    title=title,
+    description=description,
+    article=article,
+    examples=examples,
+    theme="huggingface",
+).launch(debug=True, enable_queue=True)
+
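For readers who want to exercise the model outside the Gradio UI, here is a minimal standalone sketch mirroring the inference path in app.py above; the local filename bike.jpg is hypothetical (for instance, the image fetched by the commented wget in app.py):

# Standalone captioning sketch mirroring app.py (not part of this commit).
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast

model = VisionEncoderDecoderModel.from_pretrained("sachin/vit2distilgpt2")
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = PreTrainedTokenizerFast.from_pretrained("distilgpt2")

with Image.open("bike.jpg") as img:  # "bike.jpg" is a hypothetical local test image
    pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values

generated_ids = model.generate(pixel_values, num_beams=5)  # beam search, as in app.py
caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].split('.')[0]
print(caption)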
elonmusk.jpeg
ADDED
people-walking-street-pedestrian-crossing-traffic-light-city.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+torch
+transformers
+Pillow
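The requirements are unpinned, so the Space installs whatever versions of these packages are current at build time. A quick import check (a sketch, not part of the commit) can confirm the environment after installation:

# Sanity check: verify the three dependencies from requirements.txt import cleanly.
import torch
import transformers
import PIL

print(torch.__version__, transformers.__version__, PIL.__version__)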