Commit 54f523d · Parent(s): 4793a6a

add code

Files changed:
- .idea/.gitignore +3 -0
- .idea/OFA-Image_Caption.iml +8 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- README.md +102 -12
- app.py +112 -0
- requirements.txt +5 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
.idea/OFA-Image_Caption.iml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (py37)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK" />
+</project>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/OFA-Image_Caption.iml" filepath="$PROJECT_DIR$/.idea/OFA-Image_Caption.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
README.md
CHANGED
@@ -1,12 +1,102 @@
-
-
-
-
-
-
-
-
-
-
-
-
+# OFA
+
+[[Paper]](http://arxiv.org/abs/2202.03052) [Blog] [[Colab](colab.md)]
+
+
+
+OFA is a unified multimodal pretrained model that unifies modalities (i.e., cross-modality, vision, language) and tasks
+(e.g., image generation, visual grounding, image captioning, image classification, text generation, etc.)
+into a simple sequence-to-sequence learning framework. For more information, please refer to our paper: [Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework](http://arxiv.org/abs/2202.03052).
+
+
+## News
+* 2022.2.11: Released the Colab notebook for image captioning [![][colab]](https://colab.research.google.com/drive/1Q4eNhhhLcgOP4hHqwZwU1ijOlabgve1W?usp=sharing). Enjoy!
+* 2022.2.11: Released the pretrained checkpoint of OFA-Large and the complete (two-stage) finetuning code for image captioning.
+* 2022.2.10: Released the inference code & finetuned checkpoint for image captioning, which can reproduce **the results on the COCO Karpathy test split (149.6 CIDEr)**.
+
+[colab]: <https://colab.research.google.com/assets/colab-badge.svg>
+
+## TODO
+* To release finetuning and inference code for multimodal downstream tasks soon, including image captioning, VQA, text-to-image generation, SNLI-VE, referring expression comprehension, etc.
+* To release code for pretraining soon.
+
+
+## Approach
+
+
+
+## Requirements
+* python 3.7.4
+* pytorch 1.8.1
+* JAVA 1.8 (for COCO evaluation)
+
+
+## Installation
+```bash
+git clone https://github.com/OFA-Sys/OFA
+pip install -r requirements.txt
+```
+
+
+## Datasets and Checkpoints
+See [datasets.md](datasets.md) and [checkpoints.md](checkpoints.md).
+
+
+## Pretraining
+To be released soon :)
+
+
+# Finetuning & Inference
+Below we provide methods for finetuning and inference on different downstream tasks.
+## Caption
+1. Download the data and files and put them in the correct directory
+2. Train
+```bash
+cd run_scripts/caption
+nohup sh train_caption_stage1.sh &  # stage 1: train with cross-entropy loss
+nohup sh train_caption_stage2.sh &  # stage 2: load the best stage-1 checkpoint and train with CIDEr optimization
+```
+3. Inference
+```bash
+cd run_scripts/caption ; sh evaluate_caption.sh  # inference & evaluation
+```
+
+# Gallery
+Below we provide examples of OFA on text-to-image generation and open-ended VQA. We also demonstrate its performance on an unseen task (grounded QA) as well as an unseen domain (visual grounding on images from unseen domains).
+
+## Text-to-Image Generation (normal query)
+
+
+## Text-to-Image Generation (counterfactual query)
+
+
+## Open-Ended VQA
+
+
+## Grounded QA (unseen task)
+
+
+## Visual Grounding (unseen domain)
+
+
+
+## Citation
+Please cite our paper if you find it helpful :)
+
+```
+@article{wang2022OFA,
+  title={Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework},
+  author={Wang, Peng and Yang, An and Men, Rui and Lin, Junyang and Bai, Shuai and Li, Zhikang and Ma, Jianxin and Zhou, Chang and Zhou, Jingren and Yang, Hongxia},
+  journal={arXiv e-prints},
+  pages={arXiv--2202},
+  year={2022}
+}
+```
+
+
+## Related Codebase
+* [fairseq](https://github.com/pytorch/fairseq)
+
+
+## License
+Apache-2.0
app.py
ADDED
@@ -0,0 +1,112 @@
+import gradio as gr
+import os
+import torch
+import numpy as np
+from fairseq import utils, tasks
+from utils import checkpoint_utils
+from utils.eval_utils import eval_step
+from tasks.mm_tasks.caption import CaptionTask
+from models.ofa import OFAModel
+from PIL import Image
+from torchvision import transforms
+
+
+# Register caption task
+tasks.register_task('caption', CaptionTask)
+# turn on cuda if GPU is available
+use_cuda = torch.cuda.is_available()
+# use fp16 only when GPU is available
+use_fp16 = False
+
+os.system('wget https://ofa-silicon.oss-us-west-1.aliyuncs.com/checkpoints/caption_large_best_clean.pt')
+os.system('mkdir -p checkpoints')
+os.system('mv caption_large_best_clean.pt checkpoints/caption.pt')
+
+# Load pretrained ckpt & config
+overrides = {"bpe_dir": "utils/BPE", "eval_cider": False, "beam": 5,
+             "max_len_b": 16, "no_repeat_ngram_size": 3, "seed": 7}
+models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+    utils.split_paths('checkpoints/caption.pt'),
+    arg_overrides=overrides
+)
+
+# Move models to GPU
+for model in models:
+    model.eval()
+    if use_fp16:
+        model.half()
+    if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
+        model.cuda()
+    model.prepare_for_inference_(cfg)
+
+# Initialize generator
+generator = task.build_generator(models, cfg.generation)
+
+mean = [0.5, 0.5, 0.5]
+std = [0.5, 0.5, 0.5]
+
+patch_resize_transform = transforms.Compose([
+    lambda image: image.convert("RGB"),
+    transforms.Resize((cfg.task.patch_image_size, cfg.task.patch_image_size), interpolation=Image.BICUBIC),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=mean, std=std),
+])
+
+# Text preprocess
+bos_item = torch.LongTensor([task.src_dict.bos()])
+eos_item = torch.LongTensor([task.src_dict.eos()])
+pad_idx = task.src_dict.pad()
+
+
+def encode_text(text, length=None, append_bos=False, append_eos=False):
+    s = task.tgt_dict.encode_line(
+        line=task.bpe.encode(text),
+        add_if_not_exist=False,
+        append_eos=False
+    ).long()
+    if length is not None:
+        s = s[:length]
+    if append_bos:
+        s = torch.cat([bos_item, s])
+    if append_eos:
+        s = torch.cat([s, eos_item])
+    return s
+
+
+# Construct input for caption task
+def construct_sample(image: Image):
+    patch_image = patch_resize_transform(image).unsqueeze(0)
+    patch_mask = torch.tensor([True])
+    src_text = encode_text(" what does the image describe?", append_bos=True, append_eos=True).unsqueeze(0)
+    src_length = torch.LongTensor([s.ne(pad_idx).long().sum() for s in src_text])
+    sample = {
+        "id": np.array(['42']),
+        "net_input": {
+            "src_tokens": src_text,
+            "src_lengths": src_length,
+            "patch_images": patch_image,
+            "patch_masks": patch_mask
+        }
+    }
+    return sample
+
+
+# Function to turn FP32 to FP16
+def apply_half(t):
+    if t.dtype is torch.float32:
+        return t.to(dtype=torch.half)
+    return t
+
+
+# Function for image captioning
+def image_caption(inp):
+    sample = construct_sample(inp)
+    sample = utils.move_to_cuda(sample) if use_cuda else sample
+    sample = utils.apply_to_sample(apply_half, sample) if use_fp16 else sample
+    with torch.no_grad():
+        result, scores = eval_step(task, generator, models, sample)
+    return result[0]['caption']
+
+
+io = gr.Interface(fn=image_caption, inputs=gr.inputs.Image(type='pil'), outputs='text')
+io.launch(debug=True)
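The Gradio interface is the only entry point in app.py. For a quick local check of the captioning pipeline itself, one might temporarily replace the final `io.launch(debug=True)` call with a minimal sketch like the one below; the image path is hypothetical, and it assumes the script is run from the repository root so the relative imports and the downloaded checkpoint resolve:

```python
from PIL import Image

# Hypothetical smoke test: run the same steps the Gradio callback performs,
# but on a local file instead of an uploaded image.
test_image = Image.open("example.jpg")   # assumed local test image
print("Caption:", image_caption(test_image))
```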
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+-e ./fairseq/
+ftfy==6.0.3
+tensorboardX==2.4.1
+pycocotools==2.0.4
+pycocoevalcap==1.2