"""Gradio app for pairwise labeling of model outputs stored on the Hugging Face Hub.

The input dataset file is downloaded from the Hub, each row is shown as a pair
of generations (A on the left, B on the right), and the judgments are uploaded
back to an output dataset repository as JSON Lines files.
"""

import datetime
import logging
import os
import uuid

import gradio as gr
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, upload_file, list_repo_tree

load_dotenv()

# Configuration (read from the environment / .env file)
HF_INPUT_DATASET = os.getenv("HF_INPUT_DATASET")
HF_INPUT_DATASET_PATH = os.getenv("HF_INPUT_DATASET_PATH")
HF_INPUT_DATASET_ID_COLUMN = os.getenv("HF_INPUT_DATASET_ID_COLUMN")
HF_INPUT_DATASET_COLUMN_A = os.getenv("HF_INPUT_DATASET_COLUMN_A")
HF_INPUT_DATASET_COLUMN_B = os.getenv("HF_INPUT_DATASET_COLUMN_B")
HF_OUTPUT_DATASET = os.getenv("HF_OUTPUT_DATASET")
HF_OUTPUT_DATASET_DIR = os.getenv("HF_OUTPUT_DATASET_DIR")

# The choice names below match the button labels defined in the UI section.
INSTRUCTIONS = """
# Pairwise Model Output Labeling

Please compare the two model outputs shown below and select which one you think is better.

- Choose "A is better" if the left output (Model Output A) is superior
- Choose "B is better" if the right output (Model Output B) is superior
- Choose "Tie" if they are equally good or bad
- Choose "Can't choose" if you cannot make a determination
"""

# Number of buffered judgments that triggers an upload; also the size of the
# sample dataset used when the input dataset cannot be loaded.
SAVE_EVERY_N_EXAMPLES = 5


class PairwiseLabeler:
    """Walk through a dataset of output pairs and collect judgments.

    Attributes:
        current_index: Position of the pair currently being shown.
        results: Judgments recorded since the last successful upload.
        labeler_id: Short random identifier shared by every judgment recorded
            by this instance.
        df: The loaded dataset (or the sample fallback).
    """

    def __init__(self):
        self.current_index = 0
        self.results = []
        # Generated once so that all judgments of this labeling session carry
        # the same anonymous ID.
        self.labeler_id = str(uuid.uuid4())[:8]
        self.df = self.read_hf_dataset()

    def __len__(self):
        """Return the number of pairs in the dataset."""
        return len(self.df)

    def read_hf_dataset(self) -> pd.DataFrame:
        """Download the input file from the Hub and load it as a DataFrame.

        Supported extensions are ``.json``, ``.jsonl``, ``.csv`` and
        ``.parquet``. Any failure (download error, unsupported extension,
        parse error) is logged and a small sample dataset is returned instead.
        """
        try:
            local_file = hf_hub_download(
                repo_id=HF_INPUT_DATASET,
                repo_type="dataset",
                filename=HF_INPUT_DATASET_PATH,
            )
            if local_file.endswith(".json"):
                return pd.read_json(local_file)
            if local_file.endswith(".jsonl"):
                return pd.read_json(local_file, orient="records", lines=True)
            if local_file.endswith(".csv"):
                return pd.read_csv(local_file)
            if local_file.endswith(".parquet"):
                return pd.read_parquet(local_file)
            raise ValueError(f"Unsupported file type: {local_file}")
        except Exception:
            # Deliberate best-effort: fall back to sample data if loading
            # fails, but keep the traceback so the cause is visible.
            logging.exception(
                "Couldn't read HF dataset from %s. Using sample data instead.",
                HF_INPUT_DATASET_PATH,
            )
            sample_data = {
                HF_INPUT_DATASET_ID_COLUMN: [
                    f"sample_{i}" for i in range(SAVE_EVERY_N_EXAMPLES)
                ],
                HF_INPUT_DATASET_COLUMN_A: [
                    f"This is sample generation A {i}"
                    for i in range(SAVE_EVERY_N_EXAMPLES)
                ],
                HF_INPUT_DATASET_COLUMN_B: [
                    f"This is sample generation B {i}"
                    for i in range(SAVE_EVERY_N_EXAMPLES)
                ],
            }
            return pd.DataFrame(sample_data)

    def get_current_pair(self):
        """Return ``(item_id, left_text, right_text)`` for the current index.

        Returns ``(None, None, None)`` once every row has been consumed.
        """
        if self.current_index >= len(self.df):
            return None, None, None

        item = self.df.iloc[self.current_index]
        item_id = item.get(HF_INPUT_DATASET_ID_COLUMN, f"item_{self.current_index}")
        left_text = item.get(HF_INPUT_DATASET_COLUMN_A, "")
        right_text = item.get(HF_INPUT_DATASET_COLUMN_B, "")
        return item_id, left_text, right_text

    def submit_judgment(self, item_id, left_text, right_text, choice):
        """Record a judgment and advance to the next pair.

        Args:
            item_id: Identifier of the pair being judged.
            left_text: Text shown as output A.
            right_text: Text shown as output B.
            choice: The label selected by the user.

        Returns:
            ``(next_id, next_left, next_right, current_index)``. When there is
            nothing left to label, the inputs are echoed back unchanged and
            nothing is recorded.
        """
        # The index check covers UIs that send an empty string rather than
        # None for the cleared hidden ID field after the last pair.
        if item_id is None or self.current_index >= len(self.df):
            return item_id, left_text, right_text, self.current_index

        # Record the judgment
        self.results.append(
            {
                "item_id": item_id,
                "generation_a": left_text,
                "generation_b": right_text,
                "judgment": choice,
                "timestamp": datetime.datetime.now().isoformat(),
                "labeler_id": self.labeler_id,  # Anonymous ID for the labeling session
            }
        )

        # Move to next item
        self.current_index += 1

        # Save periodically. ">=" (rather than an exact multiple) retries a
        # previously failed upload on the next judgment, and the end-of-data
        # check flushes a trailing partial batch.
        finished = self.current_index >= len(self.df)
        if finished or len(self.results) >= SAVE_EVERY_N_EXAMPLES:
            self.save_results()

        # Get next pair
        next_id, next_left, next_right = self.get_current_pair()
        return next_id, next_left, next_right, self.current_index

    def save_results(self):
        """Upload buffered judgments to the output dataset as a JSONL file.

        The buffer is cleared only after a successful upload; on failure the
        error is logged and the judgments stay buffered for the next attempt.
        """
        if not self.results:
            return

        try:
            # Serialize in memory instead of going through a shared temp file.
            payload = (
                pd.DataFrame(self.results)
                .to_json(orient="records", lines=True)
                .encode("utf-8")
            )

            # Repo paths always use "/" regardless of the host OS; an unset
            # directory means the repository root.
            output_dir = (HF_OUTPUT_DATASET_DIR or "").strip("/")
            prefix = f"{output_dir}/" if output_dir else ""

            try:
                existing = {
                    entry.path
                    for entry in list_repo_tree(
                        repo_id=HF_OUTPUT_DATASET,
                        repo_type="dataset",
                        path_in_repo=output_dir or None,
                    )
                }
            except Exception:
                # Listing fails when the directory does not exist yet; start
                # numbering from 1 in that case.
                existing = set()

            # Start from the entry count (the original naming scheme) and skip
            # forward so an existing file is never overwritten.
            file_index = len(existing) + 1
            target_path = f"{prefix}results_{file_index}.jsonl"
            while target_path in existing:
                file_index += 1
                target_path = f"{prefix}results_{file_index}.jsonl"

            # Push to Hugging Face Hub
            upload_file(
                repo_id=HF_OUTPUT_DATASET,
                repo_type="dataset",
                path_in_repo=target_path,
                path_or_fileobj=payload,
            )

            saved_count = len(self.results)  # capture before clearing
            self.results = []
            logging.info(f"Saved {saved_count} results to {HF_OUTPUT_DATASET}")
        except Exception as e:
            logging.error(f"Error saving results: {e}")


# Initialize the labeler
labeler = PairwiseLabeler()

# Get the first pair
initial_id, initial_left, initial_right = labeler.get_current_pair()

with gr.Blocks() as app:
    gr.Markdown(INSTRUCTIONS)

    with gr.Row():
        with gr.Column():
            left_output = gr.Textbox(
                value=initial_left,
                label="Model Output A",
                lines=10,
                interactive=False,
            )
        with gr.Column():
            right_output = gr.Textbox(
                value=initial_right,
                label="Model Output B",
                lines=10,
                interactive=False,
            )

    # Hidden field that carries the current item's ID through the callbacks.
    item_id = gr.Textbox(value=initial_id, visible=False)

    with gr.Row():
        left_btn = gr.Button("⬅️ A is better", variant="primary")
        right_btn = gr.Button("➡️ B is better", variant="primary")
        tie_btn = gr.Button("🤝 Tie", variant="primary")
        cant_choose_btn = gr.Button("🤔 Can't choose")

    # Read-only progress indicator.
    current_sample_sld = gr.Slider(
        minimum=0,
        maximum=len(labeler),
        step=1,
        value=labeler.current_index,
        interactive=False,
        label='sample_ind',
        info=f"Samples labeled (out of {len(labeler)})",
        show_label=False,
        container=False,
        scale=5,
    )

    def judge(choice, item_id, left_text, right_text):
        """Record ``choice`` for the shown pair and return the next UI state."""
        new_id, new_left, new_right, new_index = labeler.submit_judgment(
            item_id, left_text, right_text, choice
        )
        return new_id, new_left, new_right, new_index

    def judge_left(item_id, left_text, right_text):
        """Callback for the "A is better" button."""
        return judge("A is better", item_id, left_text, right_text)

    def judge_right(item_id, left_text, right_text):
        """Callback for the "B is better" button."""
        return judge("B is better", item_id, left_text, right_text)

    def judge_tie(item_id, left_text, right_text):
        """Callback for the "Tie" button."""
        return judge("Tie", item_id, left_text, right_text)

    def judge_cant_choose(item_id, left_text, right_text):
        """Callback for the "Can't choose" button."""
        return judge("Can't choose", item_id, left_text, right_text)

    # Every button reads the same inputs and refreshes the same outputs.
    for button, handler in (
        (left_btn, judge_left),
        (right_btn, judge_right),
        (tie_btn, judge_tie),
        (cant_choose_btn, judge_cant_choose),
    ):
        button.click(
            handler,
            inputs=[item_id, left_output, right_output],
            outputs=[item_id, left_output, right_output, current_sample_sld],
        )

if __name__ == "__main__":
    app.launch()