Initial commit
- app.py +82 -0
- app_utils.py +97 -0
- evaluation_utils.py +69 -0
- requirements.txt +5 -0
- test.jsonl +0 -0
- train.jsonl +0 -0
app.py
ADDED
@@ -0,0 +1,82 @@
import logging

import gradio as gr

from app_utils import evaluate_prompt, get_split

logging.basicConfig(level=logging.INFO)


with gr.Blocks(title=f"Prompting Challenge ({get_split()})") as demo:
    gr.Markdown(
        f"""
# Prompting Challenge
### ({get_split()})
""" + """
The goal of this challenge is to prompt GPT-4 to "unscramble" a sentence.

The input is a sentence with scrambled word order, e.g.: *"are How ? you"*

GPT-4 should identify the original sentence, e.g.: *"How are you?"*

Enter your prompt template here. Use `{% shuffled_sentence %}` at the place where you want the shuffled sentence to be inserted.
"""
    )

    input_text = gr.Textbox(
        lines=10,
        label="Input Text",
        value="Unscramble the following sentence: {% shuffled_sentence %}"
    )
    submit_button = gr.Button("Submit")
    results_output = gr.HTML(label="Results")

    def update_results(prompt):
        result_tuples = list(evaluate_prompt(prompt))
        if result_tuples:
            total_score = sum(item_score for _, _, _, item_score in result_tuples)
            score = total_score / len(result_tuples)
        else:
            score = 0
        html_output = "<dl style='font-family: Arial, sans-serif;'>"
        html_output += f"<h2 style='color: #333; margin-top: 20px; margin-bottom: 20px;'>Accuracy: {100 * score:.1f}%</h2>"
        newline = '\n'
        for index, (original, prompt, response, item_score) in enumerate(result_tuples, 1):
            background_color = "#fff4ea" if item_score < 0.5 else "#e4ffe4" if item_score > 0.9 else "whitesmoke"
            html_output += f"""
            <div style='background-color: {background_color}; padding: 10px; margin-bottom: 20px;'>
                <h3 style='color: #333; margin-top: 0;'>Test item #{index}</h3>
                <dt style='padding: 5px;'>
                    <span style='font-weight: 600;'>Original Sentence:</span>
                </dt>
                <dd style='margin-left: 20px; padding: 5px;'>{original.replace(newline, "<br>")}</dd>

                <dt style='padding: 5px;'>
                    <span style='font-weight: 600;'>Prompt:</span>
                </dt>
                <dd style='margin-left: 20px; padding: 5px;'>{prompt.replace(newline, "<br>")}</dd>

                <dt style='padding: 5px;'>
                    <span style='font-weight: 600;'>Response by GPT-4:</span>
                </dt>
                <dd style='margin-left: 20px; padding: 5px; font-style: italic;'>{response.replace(newline, "<br>")}</dd>
                <dt style='padding: 5px;'>
                    <span style='font-weight: 600;'>Score:</span>
                </dt>
                <dd style='margin-left: 20px; padding: 5px;'>
                    <span style='color: #333;'>{100 * item_score:.1f}%</span>
                </dd>
            </div>
            """
        html_output += "</dl>"
        return html_output

    submit_button.click(
        fn=update_results,
        inputs=[input_text],
        outputs=[results_output]
    )

if __name__ == "__main__":
    demo.launch()
    # demo.launch(share=True)
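A quick, self-contained sketch of the template mechanics (not part of the Space code itself): the default Textbox value above is a template, and evaluate_prompt in app_utils.py fills the placeholder with each scrambled sentence via a plain string replace, roughly like this:

# Illustrative only; mirrors the replace() call in app_utils.evaluate_prompt.
template = "Unscramble the following sentence: {% shuffled_sentence %}"
shuffled = "are How ? you"  # example sentence from the instructions above
prompt = template.replace("{% shuffled_sentence %}", shuffled)
print(prompt)  # Unscramble the following sentence: are How ? you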
app_utils.py
ADDED
@@ -0,0 +1,97 @@
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import jsonlines
from openai import OpenAI
from dotenv import load_dotenv

from evaluation_utils import evaluate_response


def get_split():
    load_dotenv()
    split = os.getenv("SPLIT")
    if split == "train":
        return "evaluation on development set"
    elif split == "test":
        return "evaluation on test set"


# Utility function to chunk a list into batches
def chunk_list(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]


# Send an individual request to the OpenAI API
def send_request(client, prompt, index):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        seed=42,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=1024,
    )
    return index, response.choices[0].message.content


def evaluate_prompt(prompt: str, num_samples: int = None, split: str = None, batch_size: int = 5, progress=gr.Progress()):
    progress(0, desc="Starting...")
    load_dotenv()

    if num_samples is None:
        num_samples = int(os.getenv("NUM_SAMPLES"))

    if split is None:
        split = os.getenv("SPLIT")
    assert split in ["train", "test"]

    # Path to the {split}.jsonl file next to this script
    test_file_path = Path(__file__).parent / f"{split}.jsonl"

    # Load the data from the jsonl file
    test_data = []
    with jsonlines.open(test_file_path) as reader:
        for item in reader:
            test_data.append(item)

    # Limit to the first num_samples items for faster evaluation
    test_data = test_data[:num_samples]

    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    responses = [None] * num_samples  # Pre-allocate a list to store responses in order
    instantiated_prompts = []

    # Create and process batches
    for batch_data in chunk_list(test_data, batch_size):
        # Prepare the prompts for this batch
        batch_prompts = [
            prompt.replace("{% shuffled_sentence %}", test_item["shuffled_tokenized"])
            for test_item in batch_data
        ]
        instantiated_prompts.extend(batch_prompts)

        # Send requests in parallel using ThreadPoolExecutor
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(send_request, client, item_prompt, i): i
                for i, item_prompt in enumerate(batch_prompts, start=len(instantiated_prompts) - len(batch_prompts))
            }

            for future in as_completed(futures):
                index = futures[future]  # Fall back to the submitted index if the request failed
                try:
                    index, response = future.result()
                    responses[index] = response  # Store the response at the correct index
                except Exception as e:
                    print(f"Request failed: {e}")
                    responses[index] = "Error: Request failed"

        # Update progress after each batch
        progress(len(instantiated_prompts) / len(test_data), desc="Processing batches...")

    # Evaluate responses
    scores = []
    for test_item, instantiated_prompt, response in zip(test_data, instantiated_prompts, responses):
        score = evaluate_response(test_item["original_tokenized"], response)
        scores.append(score)
        yield (test_item["original_sentence"], instantiated_prompt, response, score)
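For reference, evaluate_prompt only touches three fields of each JSONL record, plus a few environment variables loaded via load_dotenv(). The record below is an illustrative sketch: the field names come from the code, while the sentences are made up and not taken from train.jsonl or test.jsonl.

# Hypothetical record, shaped after the fields accessed in evaluate_prompt.
example_record = {
    "original_sentence": "How are you?",    # shown to the user in the results
    "original_tokenized": "how are you ?",  # reference token order for scoring
    "shuffled_tokenized": "are How ? you",  # substituted into the prompt template
}

# Environment variables read by this module:
#   OPENAI_API_KEY  - credentials for the OpenAI client
#   SPLIT           - "train" or "test", selects {split}.jsonl
#   NUM_SAMPLES     - how many items to evaluate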
evaluation_utils.py
ADDED
@@ -0,0 +1,69 @@
import spacy
from fast_sentence_tokenize import tokenize_text


def evaluate_response(original_tokenized: str, response: str) -> float:
    """
    - Tokenize the response string
    - Create a list of response tokens
    - Assign every original token a rank:
      - Only look at the last mention of a token in the response
      - Rank the tokens by how early they appear in the response (last mention only)
    - Calculate ranking accuracy

    Returns a value between 0 and 1
    """
    original_tokenized = original_tokenized.strip().lower()
    response = response.strip().lower()

    # Tokenize the response string with fast_sentence_tokenize
    response_tokens = tokenize_text(response)

    # Create a list of original tokens
    original_tokens = original_tokenized.split()

    # Every original token has to appear in the response
    response_token_ranks = {}
    for token in original_tokens:
        if token not in response_tokens:
            return 0  # If any original token is missing from the response, return 0 immediately

    # Create ranks for original tokens
    original_token_ranks = {}
    for i, token in enumerate(original_tokens):
        original_token_ranks[token] = i

    # Create ranks for response tokens
    for token in original_tokens:
        # Assign index of last occurrence of token in response
        response_token_ranks[token] = len(response_tokens) - 1 - response_tokens[::-1].index(token)

    # Normalize the response token ranks
    sorted_ranks = sorted(set(response_token_ranks.values()))
    rank_mapping = {old_rank: new_rank for new_rank, old_rank in enumerate(sorted_ranks)}
    for token, rank in response_token_ranks.items():
        response_token_ranks[token] = rank_mapping[rank]

    # Calculate Kendall's tau
    n = len(original_tokens)
    concordant_pairs = 0
    discordant_pairs = 0

    for i in range(n):
        for j in range(i + 1, n):
            original_diff = original_token_ranks[original_tokens[i]] - original_token_ranks[original_tokens[j]]
            response_diff = response_token_ranks[original_tokens[i]] - response_token_ranks[original_tokens[j]]

            if original_diff * response_diff > 0:
                concordant_pairs += 1
            elif original_diff * response_diff < 0:
                discordant_pairs += 1

    total_pairs = n * (n - 1) // 2
    if total_pairs == 0:
        # A single-token original has no pairs to compare; treat it as perfectly ordered
        return 1.0
    kendall_tau = (concordant_pairs - discordant_pairs) / total_pairs

    # Normalize Kendall's tau to be between 0 and 1
    normalized_kendall_tau = (kendall_tau + 1) / 2

    return normalized_kendall_tau
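A quick sanity check of the metric, assuming tokenize_text splits the responses below into lower-cased words and punctuation marks; the exact scores depend on that tokenization.

from evaluation_utils import evaluate_response

print(evaluate_response("how are you ?", "How are you?"))   # 1.0: every token pair in the original order
print(evaluate_response("how are you ?", "You are how?"))   # 0.5: half of the token pairs are inverted
print(evaluate_response("how are you ?", "How are they?"))  # 0: "you" is missing from the response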
requirements.txt
ADDED
@@ -0,0 +1,5 @@
spacy
jsonlines
openai
python-dotenv
fast-sentence-tokenize
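Note that gradio is imported by app.py but not listed here; on Hugging Face Spaces it is typically provided by the Gradio SDK runtime, so for a local run it has to be installed separately alongside the packages from requirements.txt (pip install -r requirements.txt).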
test.jsonl
ADDED
The diff for this file is too large to render.
train.jsonl
ADDED
The diff for this file is too large to render.