lisa-on-cuda/model/llava/eval/eval_gpt_review_visual.py
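"""Pairwise answer evaluation with GPT-4 for visual QA.

For each question, two model answers are compared against image captions and
bounding-box context, using a category-specific judging prompt loaded from a
rule file. Review requests are dispatched in parallel via Ray, and the first
line of each GPT-4 review is expected to carry one score per answer.
"""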
import argparse
import json
import os
import time
import openai
import ray
import tqdm
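
# Example invocation (file names are illustrative, not shipped with the repo):
#   python eval_gpt_review_visual.py \
#       -q questions.jsonl -c context.jsonl \
#       -a answers_model1.jsonl answers_model2.jsonl \
#       -r rule.json -o reviews.jsonl
# The legacy openai client used here reads OPENAI_API_KEY from the environment.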

@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
    # Retry until the request succeeds: rate-limit errors are retried
    # silently, any other error is printed; every failed attempt is
    # followed by a 1-second back-off before retrying.
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful and precise assistant for checking the quality of the answer.",
                    },
                    {
                        "role": "user",
                        "content": content,
                    },
                ],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(1)

    print("success!")
    return response["choices"][0]["message"]["content"]
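
# Ray usage pattern (applied in __main__ below): get_eval.remote(...) returns
# an ObjectRef immediately; ray.get(...) later blocks until the result is ready.
#   handle = get_eval.remote(content, 1024)
#   review = ray.get(handle)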

def parse_score(review):
    # The first line of the review is expected to hold exactly two scores,
    # separated by a space or a comma; anything else yields [-1, -1].
    try:
        score_pair = review.split("\n")[0]
        score_pair = score_pair.replace(",", " ")
        sp = score_pair.split(" ")
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print("error", review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print("error", review)
        return [-1, -1]
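
# Sanity check of the expected format (review text is hypothetical):
#   parse_score("8 7\nAssistant 1 gave a precise answer...")  ->  [8.0, 7.0]
#   parse_score("no scores on this line\n...")                ->  [-1, -1]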

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.")
    parser.add_argument("-q", "--question")
    parser.add_argument("-c", "--context")
    parser.add_argument("-a", "--answer-list", nargs="+", default=[])
    parser.add_argument("-r", "--rule")
    parser.add_argument("-o", "--output")
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="maximum number of tokens produced in the output",
    )
    args = parser.parse_args()

    ray.init()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), "r"))

    review_file = open(f"{args.output}", "w")

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context["image"]: context for context in context_list}

    js_list = []
    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        inst = image_to_context[ques["image"]]
        cap_str = "\n".join(inst["captions"])
        box_str = "\n".join(
            [
                f'{instance["category"]}: {instance["bbox"]}'
                for instance in inst["instances"]
            ]
        )

        category = ques["category"]
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule["prompt"]
        role = rule["role"]
        content = (
            f"[Context]\n{cap_str}\n\n{box_str}\n\n"
            f'[Question]\n{ques["text"]}\n\n'
            f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
            f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
            f"[System]\n{prompt}\n\n"
        )
        js_list.append(
            {
                "id": idx + 1,
                "question_id": ques["question_id"],
                "answer1_id": ans1.get("answer_id", ans1["question_id"]),
                # Fall back to question_id when answer_id is missing, mirroring
                # answer1; the original fallback ans2["answer_id"] would raise
                # the very KeyError that .get() is meant to guard against.
                "answer2_id": ans2.get("answer_id", ans2["question_id"]),
                "category": category,
            }
        )
        idx += 1
        handles.append(get_eval.remote(content, args.max_tokens))
        # To avoid the rate limit set by OpenAI
        time.sleep(1)

    reviews = ray.get(handles)
    for idx, review in enumerate(reviews):
        scores = parse_score(review)
        js_list[idx]["content"] = review
        js_list[idx]["tuple"] = scores
        review_file.write(json.dumps(js_list[idx]) + "\n")
    review_file.close()
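
# Each output line is one JSON record; a hypothetical example
# (all field values are illustrative):
#   {"id": 1, "question_id": 0, "answer1_id": 0, "answer2_id": 0,
#    "category": "conv", "content": "8 7\n...", "tuple": [8.0, 7.0]}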