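"""GPT-4-based pairwise evaluation of visual QA answers.

For each question, the script assembles a textual context (image captions
plus detected object boxes), the question, and two candidate answers, asks
GPT-4 to score the answer pair against a category-specific rule, and writes
one JSON review per line to the output file. Requests are dispatched as Ray
remote tasks so reviews are collected in parallel.

Example invocation (the script and data file names below are illustrative,
not fixed by this code):

    python eval_gpt_review.py \
        -q questions.jsonl \
        -c context.jsonl \
        -a answers_model1.jsonl answers_model2.jsonl \
        -r rule.json \
        -o reviews.jsonl
"""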
import argparse
import json
import os
import time

import openai
import ray


@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
    # Query GPT-4, retrying until a request succeeds: rate-limit errors are
    # retried silently, other errors are printed, and every failed attempt
    # backs off for one second before the next try.
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful and precise assistant for checking the quality of the answer.",
                    },
                    {
                        "role": "user",
                        "content": content,
                    },
                ],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(1)

    print("success!")
    return response["choices"][0]["message"]["content"]
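
# For reference, a sketch (not an exhaustive schema) of the openai<1.0
# ChatCompletion response indexed above, trimmed to the fields this script
# actually uses:
#
#   {"choices": [{"message": {"role": "assistant", "content": "8 7\n..."}}]}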


def parse_score(review):
    # The review's first line is expected to carry two numeric scores (one
    # per answer), separated by a space or a comma; anything else yields the
    # sentinel pair [-1, -1].
    try:
        score_pair = review.split("\n")[0]
        score_pair = score_pair.replace(",", " ")
        sp = score_pair.split(" ")
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print("error", review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print("error", review)
        return [-1, -1]
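
# A quick sanity check of the parser (hypothetical review strings, not real
# GPT-4 output):
#
#   parse_score("8 7\nAssistant 1 is more precise ...")  # -> [8.0, 7.0]
#   parse_score("no scores here")                        # -> [-1, -1]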


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.")
    parser.add_argument("-q", "--question")
    parser.add_argument("-c", "--context")
    parser.add_argument("-a", "--answer-list", nargs="+", default=[])
    parser.add_argument("-r", "--rule")
    parser.add_argument("-o", "--output")
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="maximum number of tokens produced in the output",
    )
    args = parser.parse_args()

    ray.init()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), "r"))

    review_file = open(f"{args.output}", "w")

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context["image"]: context for context in context_list}

    js_list = []
    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        # Build the textual context for the image: captions plus one
        # "category: bbox" line per detected instance.
        inst = image_to_context[ques["image"]]
        cap_str = "\n".join(inst["captions"])
        box_str = "\n".join(
            f'{instance["category"]}: {instance["bbox"]}'
            for instance in inst["instances"]
        )

        category = ques["category"]
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule["prompt"]
        role = rule["role"]
        content = (
            f"[Context]\n{cap_str}\n\n{box_str}\n\n"
            f'[Question]\n{ques["text"]}\n\n'
            f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
            f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
            f"[System]\n{prompt}\n\n"
        )
        js_list.append(
            {
                "id": idx + 1,
                "question_id": ques["question_id"],
                # Fall back to the question_id when an answer_id is absent.
                "answer1_id": ans1.get("answer_id", ans1["question_id"]),
                "answer2_id": ans2.get("answer_id", ans2["question_id"]),
                "category": category,
            }
        )
        idx += 1
        handles.append(get_eval.remote(content, args.max_tokens))
        # To avoid the rate limit set by OpenAI
        time.sleep(1)

    reviews = ray.get(handles)
    for idx, review in enumerate(reviews):
        scores = parse_score(review)
        js_list[idx]["content"] = review
        js_list[idx]["tuple"] = scores
        review_file.write(json.dumps(js_list[idx]) + "\n")
    review_file.close()