# labeling-summarization
#
# Sleeping
#
# File size: 7,943 Bytes
#
# f053717

import gradio as gr
import pandas as pd
import os
import uuid
import datetime
import logging
from huggingface_hub import hf_hub_download, upload_file, list_repo_tree
from dotenv import load_dotenv

# Load variables from a local .env file (if present) so the os.getenv() calls below can see them.
load_dotenv()

# Configuration
# Every HF_* value is read from the environment and is None when the variable is unset.

# Input: dataset repo on the Hugging Face Hub and the file inside it that holds the pairs.
HF_INPUT_DATASET = os.getenv("HF_INPUT_DATASET")
HF_INPUT_DATASET_PATH = os.getenv("HF_INPUT_DATASET_PATH")
# Column names in the input file: the item identifier and the two generations to compare.
HF_INPUT_DATASET_ID_COLUMN = os.getenv("HF_INPUT_DATASET_ID_COLUMN")
HF_INPUT_DATASET_COLUMN_A = os.getenv("HF_INPUT_DATASET_COLUMN_A")
HF_INPUT_DATASET_COLUMN_B = os.getenv("HF_INPUT_DATASET_COLUMN_B")
# Output: dataset repo and directory inside it that receive the judgment files.
HF_OUTPUT_DATASET = os.getenv("HF_OUTPUT_DATASET")
HF_OUTPUT_DATASET_DIR = os.getenv("HF_OUTPUT_DATASET_DIR")

# Markdown shown at the top of the page. The quoted choices must match the button
# captions used in the UI ("A is better", "B is better", "Tie", "Can't choose").
INSTRUCTIONS = """
# Pairwise Model Output Labeling

Please compare the two model outputs shown below and select which one you think is better.
- Choose "A is better" if Model Output A (left) is superior
- Choose "B is better" if Model Output B (right) is superior
- Choose "Tie" if they are equally good or bad
- Choose "Can't choose" if you cannot make a determination
"""

# Number of judgments collected between uploads; also the size of the fallback sample dataset.
SAVE_EVERY_N_EXAMPLES = 5


class PairwiseLabeler:
    """Serve pairs of model outputs one at a time and collect judgments on them.

    The pairs come from the file ``HF_INPUT_DATASET_PATH`` of the Hub dataset
    ``HF_INPUT_DATASET``. Judgments are buffered in ``self.results`` and
    uploaded as JSON-lines files to ``HF_OUTPUT_DATASET_DIR`` inside
    ``HF_OUTPUT_DATASET``.
    """

    def __init__(self):
        # Position of the pair currently being labeled.
        self.current_index = 0
        # Judgments recorded since the last successful upload.
        self.results = []
        # Anonymous ID for the labeling session: generated once so that all
        # judgments recorded by this instance share it.
        self.labeler_id = str(uuid.uuid4())[:8]
        self.df = self.read_hf_dataset()

    def __len__(self):
        """Return the number of pairs in the loaded dataset."""
        return len(self.df)

    def read_hf_dataset(self) -> pd.DataFrame:
        """Download the input file from the Hub and load it as a DataFrame.

        Supported extensions are ``.json``, ``.jsonl``, ``.csv`` and
        ``.parquet``. Any failure (download error, unsupported extension,
        parse error) is logged and replaced by a small synthetic dataset of
        ``SAVE_EVERY_N_EXAMPLES`` rows so the app can still start.
        """
        try:
            local_file = hf_hub_download(repo_id=HF_INPUT_DATASET, repo_type="dataset", filename=HF_INPUT_DATASET_PATH)
            if local_file.endswith(".json"):
                return pd.read_json(local_file)
            elif local_file.endswith(".jsonl"):
                return pd.read_json(local_file, orient="records", lines=True)
            elif local_file.endswith(".csv"):
                return pd.read_csv(local_file)
            elif local_file.endswith(".parquet"):
                return pd.read_parquet(local_file)
            else:
                raise ValueError(f"Unsupported file type: {local_file}")
        except Exception as e:
            # Deliberate best-effort fallback; include the cause so the failure is diagnosable.
            logging.error(f"Couldn't read HF dataset from {HF_INPUT_DATASET_PATH} ({e}). Using sample data instead.")
            sample_data = {
                HF_INPUT_DATASET_ID_COLUMN: [f"sample_{i}" for i in range(SAVE_EVERY_N_EXAMPLES)],
                HF_INPUT_DATASET_COLUMN_A: [f"This is sample generation A {i}" for i in range(SAVE_EVERY_N_EXAMPLES)],
                HF_INPUT_DATASET_COLUMN_B: [f"This is sample generation B {i}" for i in range(SAVE_EVERY_N_EXAMPLES)],
            }
            return pd.DataFrame(sample_data)

    def get_current_pair(self):
        """Return ``(item_id, left_text, right_text)`` for the current index.

        Returns ``(None, None, None)`` once every row has been consumed. A
        missing ID column falls back to ``item_<index>``; a missing text
        column falls back to an empty string.
        """
        if self.current_index >= len(self.df):
            return None, None, None

        item = self.df.iloc[self.current_index]
        item_id = item.get(HF_INPUT_DATASET_ID_COLUMN, f"item_{self.current_index}")
        left_text = item.get(HF_INPUT_DATASET_COLUMN_A, "")
        right_text = item.get(HF_INPUT_DATASET_COLUMN_B, "")

        return item_id, left_text, right_text

    def submit_judgment(self, item_id, left_text, right_text, choice):
        """Record ``choice`` for the given pair and advance to the next one.

        Returns ``(next_id, next_left, next_right, current_index)``. When
        there is nothing to judge (``item_id`` is None or the dataset is
        already exhausted) nothing is recorded and the inputs are echoed back
        together with the unchanged index.
        """
        # The exhaustion check also covers callers that send a non-None
        # placeholder (e.g. an empty string) after the last pair; without it
        # such calls would add bogus rows and push the index past len(df).
        if item_id is None or self.current_index >= len(self.df):
            return item_id, left_text, right_text, self.current_index

        # Record the judgment
        self.results.append({
            "item_id": item_id,
            "generation_a": left_text,
            "generation_b": right_text,
            "judgment": choice,
            "timestamp": datetime.datetime.now().isoformat(),
            "labeler_id": self.labeler_id,
        })

        # Move to next item
        self.current_index += 1

        # Save periodically, and also right after the last pair so that a
        # trailing partial batch is not lost.
        finished = self.current_index >= len(self.df)
        if finished or len(self.results) % SAVE_EVERY_N_EXAMPLES == 0:
            self.save_results()

        # Get next pair
        next_id, next_left, next_right = self.get_current_pair()
        return next_id, next_left, next_right, self.current_index

    def save_results(self):
        """Upload the buffered judgments as a new ``results_<n>.jsonl`` file.

        Does nothing when the buffer is empty. On success the buffer is
        cleared; on any failure the error is logged and the buffer is kept so
        a later call can retry.
        """
        if not self.results:
            return

        try:
            # Serialize in memory: no scratch file to collide on or to clean up.
            payload = pd.DataFrame(self.results).to_json(orient="records", lines=True)

            # Hub paths always use "/" regardless of the host OS.
            target_dir = (HF_OUTPUT_DATASET_DIR or "").strip("/")

            try:
                existing = {
                    entry.path
                    for entry in list_repo_tree(
                        repo_id=HF_OUTPUT_DATASET,
                        repo_type="dataset",
                        path_in_repo=target_dir or None,
                    )
                }
            except Exception as e:
                # Typically the output directory does not exist yet.
                logging.warning(f"Couldn't list {HF_OUTPUT_DATASET_DIR} in {HF_OUTPUT_DATASET}: {e}")
                existing = set()

            # Start from "number of entries + 1" and skip names already taken,
            # so an earlier results file is never overwritten.
            file_number = len(existing) + 1
            while True:
                file_name = f"results_{file_number}.jsonl"
                path_in_repo = f"{target_dir}/{file_name}" if target_dir else file_name
                if path_in_repo not in existing:
                    break
                file_number += 1

            # Push to Hugging Face Hub
            upload_file(
                repo_id=HF_OUTPUT_DATASET,
                repo_type="dataset",
                path_in_repo=path_in_repo,
                path_or_fileobj=payload.encode("utf-8"),
            )

            # Capture the count before clearing so the log line is accurate.
            saved_count = len(self.results)
            self.results = []
            logging.info(f"Saved {saved_count} results to {HF_OUTPUT_DATASET}")
        except Exception as e:
            logging.error(f"Error saving results: {e}")

# Initialize the labeler
labeler = PairwiseLabeler()

# Get the first pair (used as the initial content of the components below)
initial_id, initial_left, initial_right = labeler.get_current_pair()

with gr.Blocks() as app:
    gr.Markdown(INSTRUCTIONS)

    # The two generations, side by side and read-only.
    with gr.Row():
        with gr.Column():
            left_output = gr.Textbox(
                value=initial_left,
                label="Model Output A",
                lines=10,
                interactive=False
            )

        with gr.Column():
            right_output = gr.Textbox(
                value=initial_right,
                label="Model Output B",
                lines=10,
                interactive=False
            )

    # Hidden field that carries the ID of the displayed pair back to the handlers.
    item_id = gr.Textbox(value=initial_id, visible=False)

    with gr.Row():
        left_btn = gr.Button("⬅️ A is better", variant="primary")
        right_btn = gr.Button("➡️ B is better", variant="primary")
        tie_btn = gr.Button("🤝 Tie", variant="primary")
        cant_choose_btn = gr.Button("🤔 Can't choose")

    # Read-only progress indicator driven by the labeler's current index.
    current_sample_sld = gr.Slider(minimum=0, maximum=len(labeler), step=1,
                                   value=labeler.current_index,
                                   interactive=False,
                                   label='sample_ind',
                                   info=f"Samples labeled (out of {len(labeler)})",
                                   show_label=False,
                                   container=False,
                                   scale=5)

    def judge(choice, item_id, left_text, right_text):
        """Record `choice` for the displayed pair; return the next pair and the new index."""
        new_id, new_left, new_right, new_index = labeler.submit_judgment(
            item_id, left_text, right_text, choice
        )
        return new_id, new_left, new_right, new_index

    def judge_left(item_id, left_text, right_text):
        return judge("A is better", item_id, left_text, right_text)

    def judge_right(item_id, left_text, right_text):
        return judge("B is better", item_id, left_text, right_text)

    def judge_tie(item_id, left_text, right_text):
        return judge("Tie", item_id, left_text, right_text)

    def judge_cant_choose(item_id, left_text, right_text):
        return judge("Can't choose", item_id, left_text, right_text)

    def show_current_pair():
        """Return the pair and index the labeler is currently at."""
        current_id, current_left, current_right = labeler.get_current_pair()
        return current_id, current_left, current_right, labeler.current_index

    # Every button reads the same three components and updates the same four.
    judgment_inputs = [item_id, left_output, right_output]
    judgment_outputs = [item_id, left_output, right_output, current_sample_sld]

    for button, handler in (
        (left_btn, judge_left),
        (right_btn, judge_right),
        (tie_btn, judge_tie),
        (cant_choose_btn, judge_cant_choose),
    ):
        button.click(handler, inputs=judgment_inputs, outputs=judgment_outputs)

    # The component values above are fixed when the app is built. Refresh them on
    # every page load so a reload shows the pair the labeler is actually at,
    # instead of the first pair again.
    app.load(show_current_pair, outputs=judgment_outputs)

# Launch the Gradio app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    app.launch()