saridormi's picture
initial commit
f053717
raw
history blame
7.94 kB
import gradio as gr
import pandas as pd
import os
import uuid
import datetime
import logging
from huggingface_hub import hf_hub_download, upload_file, list_repo_tree
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process
# environment before the os.getenv() lookups below read them.
load_dotenv()
# Configuration (every HF_* value is read from the environment and is None when unset)

# Input: dataset repo on the Hugging Face Hub and the file inside it to label.
HF_INPUT_DATASET = os.getenv("HF_INPUT_DATASET")
HF_INPUT_DATASET_PATH = os.getenv("HF_INPUT_DATASET_PATH")
# Column names in the input file: the item identifier and the two generations to compare.
HF_INPUT_DATASET_ID_COLUMN = os.getenv("HF_INPUT_DATASET_ID_COLUMN")
HF_INPUT_DATASET_COLUMN_A = os.getenv("HF_INPUT_DATASET_COLUMN_A")
HF_INPUT_DATASET_COLUMN_B = os.getenv("HF_INPUT_DATASET_COLUMN_B")
# Output: dataset repo and directory inside it that receive the result files.
HF_OUTPUT_DATASET = os.getenv("HF_OUTPUT_DATASET")
HF_OUTPUT_DATASET_DIR = os.getenv("HF_OUTPUT_DATASET_DIR")

# Markdown shown at the top of the app. The quoted choices must match the
# button captions and the "Model Output A/B" panel labels used by the UI.
INSTRUCTIONS = """
# Pairwise Model Output Labeling
Please compare the two model outputs shown below and select which one you think is better.
- Choose "A is better" if output A (left) is superior
- Choose "B is better" if output B (right) is superior
- Choose "Tie" if they are equally good or bad
- Choose "Can't choose" if you cannot make a determination
"""

# Buffered judgments are uploaded each time their count reaches a multiple of
# this value; it is also the number of rows in the fallback sample data.
SAVE_EVERY_N_EXAMPLES = 5
class PairwiseLabeler:
    """Step through a dataset of paired generations and collect judgments.

    The input table is downloaded from the Hugging Face Hub according to the
    module-level ``HF_INPUT_*`` settings. Judgments are buffered in
    ``self.results`` and uploaded to ``HF_OUTPUT_DATASET`` in batches.

    Attributes:
        current_index: Position of the pair currently being labeled.
        results: Judgments recorded since the last successful upload.
        labeler_id: Short random ID shared by every judgment this instance
            records.
        df: The table of pairs being labeled.
    """

    def __init__(self):
        self.current_index = 0
        self.results = []
        # Generated once so that all judgments of this labeler share one ID;
        # a fresh ID per judgment could not tie a session's rows together.
        self.labeler_id = str(uuid.uuid4())[:8]
        self.df = self.read_hf_dataset()

    def __len__(self) -> int:
        """Return the number of pairs in the dataset."""
        return len(self.df)

    def read_hf_dataset(self) -> pd.DataFrame:
        """Download the input file from the Hub and load it as a DataFrame.

        The reader is chosen by file extension (.json, .jsonl, .csv,
        .parquet). If anything fails - download, parsing, or an unsupported
        extension - the error is logged with its traceback and a small
        synthetic table is returned so the app can still start.
        """
        try:
            local_file = hf_hub_download(
                repo_id=HF_INPUT_DATASET,
                repo_type="dataset",
                filename=HF_INPUT_DATASET_PATH,
            )
            if local_file.endswith(".json"):
                return pd.read_json(local_file)
            if local_file.endswith(".jsonl"):
                return pd.read_json(local_file, orient="records", lines=True)
            if local_file.endswith(".csv"):
                return pd.read_csv(local_file)
            if local_file.endswith(".parquet"):
                return pd.read_parquet(local_file)
            raise ValueError(f"Unsupported file type: {local_file}")
        except Exception:
            # Fallback to sample data if loading fails. logging.exception keeps
            # the traceback so the real cause is not lost.
            logging.exception(
                f"Couldn't read HF dataset from {HF_INPUT_DATASET_PATH}. Using sample data instead."
            )
            sample_data = {
                HF_INPUT_DATASET_ID_COLUMN: [f"sample_{i}" for i in range(SAVE_EVERY_N_EXAMPLES)],
                HF_INPUT_DATASET_COLUMN_A: [
                    f"This is sample generation A {i}" for i in range(SAVE_EVERY_N_EXAMPLES)
                ],
                HF_INPUT_DATASET_COLUMN_B: [
                    f"This is sample generation B {i}" for i in range(SAVE_EVERY_N_EXAMPLES)
                ],
            }
            return pd.DataFrame(sample_data)

    def get_current_pair(self):
        """Return ``(item_id, left_text, right_text)`` for the current index.

        Returns ``(None, None, None)`` once every row has been labeled. A
        missing ID column falls back to ``item_<index>``; a missing text
        column falls back to an empty string.
        """
        if self.current_index >= len(self.df):
            return None, None, None
        item = self.df.iloc[self.current_index]
        item_id = item.get(HF_INPUT_DATASET_ID_COLUMN, f"item_{self.current_index}")
        left_text = item.get(HF_INPUT_DATASET_COLUMN_A, "")
        right_text = item.get(HF_INPUT_DATASET_COLUMN_B, "")
        return item_id, left_text, right_text

    def submit_judgment(self, item_id, left_text, right_text, choice):
        """Record one judgment and advance to the next pair.

        Args:
            item_id: Identifier of the pair that was judged.
            left_text: Generation shown as output A.
            right_text: Generation shown as output B.
            choice: The judgment label to store.

        Returns:
            ``(next_id, next_left, next_right, current_index)``. When there is
            nothing left to label, the inputs are returned unchanged together
            with the current index and nothing is recorded.
        """
        # Also check the index: after the last pair the UI may hand back an
        # empty string instead of None, which must not be recorded as a row.
        if item_id is None or self.current_index >= len(self.df):
            return item_id, left_text, right_text, self.current_index

        self.results.append(
            {
                "item_id": item_id,
                "generation_a": left_text,
                "generation_b": right_text,
                "judgment": choice,
                "timestamp": datetime.datetime.now().isoformat(),
                "labeler_id": self.labeler_id,
            }
        )
        self.current_index += 1

        # Upload on every full batch, and once more when the dataset is
        # exhausted so a trailing partial batch is not lost.
        finished = self.current_index >= len(self.df)
        if finished or len(self.results) % SAVE_EVERY_N_EXAMPLES == 0:
            self.save_results()

        next_id, next_left, next_right = self.get_current_pair()
        return next_id, next_left, next_right, self.current_index

    def save_results(self) -> None:
        """Upload the buffered judgments to the output dataset as JSON Lines.

        The file is named ``results_<k>.jsonl`` where ``k`` is one more than
        the number of entries currently listed in the output directory. On
        success the buffer is cleared; on failure the error is logged and the
        buffer is kept, so the next call retries with the accumulated rows.
        """
        if not self.results:
            return
        try:
            # Serialize in memory: no temp file to collide with another save
            # or to be left behind when the upload fails.
            payload = (
                pd.DataFrame(self.results)
                .to_json(orient="records", lines=True)
                .encode("utf-8")
            )
            try:
                num_files = sum(
                    1
                    for _ in list_repo_tree(
                        repo_id=HF_OUTPUT_DATASET,
                        repo_type="dataset",
                        path_in_repo=HF_OUTPUT_DATASET_DIR,
                    )
                )
            except Exception:
                # Best effort: the listing fails when the directory does not
                # exist yet (first save).
                # NOTE(review): any other listing failure also lands here and
                # makes the upload target results_1.jsonl - confirm that an
                # overwrite is acceptable in that case.
                num_files = 0

            # Repo paths always use forward slashes, whatever the local OS is.
            # An unset output directory means the repo root.
            output_dir = (HF_OUTPUT_DATASET_DIR or "").strip("/")
            filename = f"results_{num_files + 1}.jsonl"
            path_in_repo = "/".join(part for part in (output_dir, filename) if part)

            upload_file(
                repo_id=HF_OUTPUT_DATASET,
                repo_type="dataset",
                path_in_repo=path_in_repo,
                path_or_fileobj=payload,
            )
            # Capture the count before clearing so the log reports what was saved.
            num_saved = len(self.results)
            self.results = []
            logging.info(f"Saved {num_saved} results to {HF_OUTPUT_DATASET}")
        except Exception as e:
            logging.error(f"Error saving results: {e}")
# Single module-level labeler; every button handler below reads and mutates it.
labeler = PairwiseLabeler()
# Seed the UI with the pair at the labeler's starting index.
initial_id, initial_left, initial_right = labeler.get_current_pair()
with gr.Blocks() as app:
gr.Markdown(INSTRUCTIONS)
# Two read-only text boxes showing the outputs under comparison.
with gr.Row():
with gr.Column():
left_output = gr.Textbox(
value=initial_left,
label="Model Output A",
lines=10,
interactive=False
)
with gr.Column():
right_output = gr.Textbox(
value=initial_right,
label="Model Output B",
lines=10,
interactive=False
)
# Invisible textbox carrying the id of the pair currently shown; it is an
# input and an output of every click handler below.
item_id = gr.Textbox(value=initial_id, visible=False)
# One button per possible judgment.
with gr.Row():
left_btn = gr.Button("⬅️ A is better", variant="primary")
right_btn = gr.Button("➡️ B is better", variant="primary")
tie_btn = gr.Button("🤝 Tie", variant="primary")
cant_choose_btn = gr.Button("🤔 Can't choose")
# Non-interactive slider used as a progress indicator: each click handler
# writes the labeler's current index into it.
current_sample_sld = gr.Slider(minimum=0, maximum=len(labeler), step=1,
value=labeler.current_index,
interactive=False,
label='sample_ind',
info=f"Samples labeled (out of {len(labeler)})",
show_label=False,
container=False,
scale=5)
# Thin per-button wrappers: each forwards the displayed pair to judge() with
# the judgment label that is stored for that button.
def judge_left(item_id, left_text, right_text):
return judge("A is better", item_id, left_text, right_text)
def judge_right(item_id, left_text, right_text):
return judge("B is better", item_id, left_text, right_text)
def judge_tie(item_id, left_text, right_text):
return judge("Tie", item_id, left_text, right_text)
def judge_cant_choose(item_id, left_text, right_text):
return judge("Can't choose", item_id, left_text, right_text)
# Record the choice through the shared labeler and return the next pair plus
# the updated index, in the same order as the `outputs` lists of the click
# bindings below.
def judge(choice, item_id, left_text, right_text):
new_id, new_left, new_right, new_index = labeler.submit_judgment(
item_id, left_text, right_text, choice
)
return new_id, new_left, new_right, new_index
# All four buttons share the same inputs and outputs; only the handler differs.
left_btn.click(
judge_left,
inputs=[item_id, left_output, right_output],
outputs=[item_id, left_output, right_output, current_sample_sld]
)
right_btn.click(
judge_right,
inputs=[item_id, left_output, right_output],
outputs=[item_id, left_output, right_output, current_sample_sld]
)
tie_btn.click(
judge_tie,
inputs=[item_id, left_output, right_output],
outputs=[item_id, left_output, right_output, current_sample_sld]
)
cant_choose_btn.click(
judge_cant_choose,
inputs=[item_id, left_output, right_output],
outputs=[item_id, left_output, right_output, current_sample_sld]
)
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
app.launch()