Seongyun committed · Commit da9108e · verified · 1 Parent(s): bcfd22a

Update README.md

Files changed (1): README.md (+137, -1)
README.md CHANGED
@@ -73,4 +73,140 @@ Also, we use the following output format. During inference, you could parse the
[RESULT] {orig_score}
```

## License
Perception Collection and Prometheus-Vision are subject to OpenAI's Terms of Use for the generated data. If you suspect any violations, please reach out to us.

# Usage
Find below an example script showing how to run the model for inference; it relies on the LLaVA codebase (which builds on `transformers`):
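
The script below reads its questions from a JSONL file, one record per line with `question_id`, `image`, and `text` fields (the `text` field carries the evaluation prompt). Here is a minimal sketch of preparing such a file; the file name and field values are illustrative placeholders, not part of the original card:

```python
import json

# Hypothetical example record; the real "text" field should hold the full
# Prometheus-Vision evaluation prompt for the image being judged.
record = {
    "question_id": "sample-0001",        # any unique identifier
    "image": "sample.jpg",               # resolved relative to --image-folder
    "text": "<your evaluation prompt>",  # the prompt fed to the judge model
}

with open("question.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")
```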

## Using the PyTorch model
### Running the model on a GPU
<details>
<summary> Click to expand </summary>

```python
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division, so the last chunk may be smaller
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model (the checkpoint is hard-coded here; the --model-path argument is not used)
    disable_torch_init()
    model_path = 'kaist-ai/prometheus-vision-13b-v1.0'
    model_name = 'llava-v1.5'
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    # Load the evaluation prompts (JSONL with question_id / image / text fields)
    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file) or ".", exist_ok=True)  # handle a bare file name
    ans_file = open(answers_file, "w")
    for line in tqdm(questions):
        idx = line["question_id"]
        image_file = line["image"]
        qs = line["text"]
        cur_prompt = qs
        if model.config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        # Wrap the prompt in the LLaVA conversation template
        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        image = Image.open(os.path.join(args.image_folder, image_file))
        image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]

        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor.unsqueeze(0).half().cuda(),
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                # no_repeat_ngram_size=3,
                max_new_tokens=1024,
                use_cache=True)

        # Decode only the newly generated tokens and strip the stop string
        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()

        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "prompt": cur_prompt,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        ans_file.flush()
    ans_file.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)
```
</details>
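
Each line of the resulting answers file stores the model's full generation in its `text` field, which, following the output format shown earlier, ends with `[RESULT] {orig_score}`. The snippet below is a minimal post-processing sketch (not part of the original script) for pulling the integer score out of `answer.jsonl`, the script's default `--answers-file`:

```python
import json
import re

# Extract the integer score from the trailing "[RESULT] {score}" marker.
result_pattern = re.compile(r"\[RESULT\]\s*(\d+)\s*$")

with open("answer.jsonl", "r") as f:
    for line in f:
        answer = json.loads(line)
        match = result_pattern.search(answer["text"].strip())
        score = int(match.group(1)) if match else None  # None if the marker is missing
        print(answer["question_id"], score)
```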

# Citation
If you find the following model helpful, please consider citing our paper!

**BibTeX:**
```bibtex
@misc{lee2024prometheusvision,
      title={Prometheus-Vision: Vision-Language Model as a Judge for Fine-Grained Evaluation},
      author={Seongyun Lee and Seungone Kim and Sue Hyun Park and Geewook Kim and Minjoon Seo},
      year={2024},
      eprint={2401.06591},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```