File size: 7,943 Bytes
f053717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import gradio as gr
import pandas as pd
import os
import uuid
import datetime
import logging
from huggingface_hub import hf_hub_download, upload_file, list_repo_tree
from dotenv import load_dotenv

# Pull settings from a local .env file (if one exists) into the process
# environment before they are read below.
load_dotenv()

# Configuration
# Every HF_* value is read from the environment; os.getenv returns None when
# the variable is not set.
# Input side: dataset repo id, file path inside that repo, and the names of
# the columns holding the item id and the two generations to compare.
HF_INPUT_DATASET = os.getenv("HF_INPUT_DATASET")
HF_INPUT_DATASET_PATH = os.getenv("HF_INPUT_DATASET_PATH")
HF_INPUT_DATASET_ID_COLUMN = os.getenv("HF_INPUT_DATASET_ID_COLUMN")
HF_INPUT_DATASET_COLUMN_A = os.getenv("HF_INPUT_DATASET_COLUMN_A")
HF_INPUT_DATASET_COLUMN_B = os.getenv("HF_INPUT_DATASET_COLUMN_B")
# Output side: dataset repo id and directory inside it that receives the
# uploaded result files.
HF_OUTPUT_DATASET = os.getenv("HF_OUTPUT_DATASET")
HF_OUTPUT_DATASET_DIR = os.getenv("HF_OUTPUT_DATASET_DIR")
# Markdown shown at the top of the labeling page. The option names match the
# button captions ("A is better", "B is better", "Tie", "Can't choose").
INSTRUCTIONS = """
# Pairwise Model Output Labeling

Please compare the two model outputs shown below and select which one you think is better.
- Choose "A is better" if Model Output A (left) is superior
- Choose "B is better" if Model Output B (right) is superior
- Choose "Tie" if they are equally good or bad
- Choose "Can't choose" if you cannot make a determination
"""
# Number of recorded judgments per upload batch; also the number of rows in
# the fallback sample data.
SAVE_EVERY_N_EXAMPLES = 5


class PairwiseLabeler:
    """Serve pairs of generations for labeling and upload the collected judgments.

    The labeler walks through the rows of a DataFrame loaded from the input
    dataset, records one judgment per row, and uploads the recorded judgments
    to the output dataset repository in batches.

    Attributes:
        current_index: Position of the row currently being labeled.
        results: Judgments recorded since the last successful upload.
        labeler_id: Short anonymous ID shared by every judgment this instance records.
        df: The loaded input data (or generated sample data if loading failed).
    """

    def __init__(self):
        self.current_index = 0
        self.results = []
        # Generated once so all judgments from this labeler carry the same ID.
        self.labeler_id = str(uuid.uuid4())[:8]
        self.df = self.read_hf_dataset()

    def __len__(self):
        """Return the number of pairs available for labeling."""
        return len(self.df)

    def read_hf_dataset(self) -> pd.DataFrame:
        """Download the input file from the Hub and load it as a DataFrame.

        Supported extensions are .json, .jsonl, .csv and .parquet. Any failure
        (download error, parse error, unsupported extension) is logged and a
        small generated sample frame is returned instead, so the app can
        still start.
        """
        try:
            local_file = hf_hub_download(repo_id=HF_INPUT_DATASET, repo_type="dataset", filename=HF_INPUT_DATASET_PATH)
            if local_file.endswith(".json"):
                return pd.read_json(local_file)
            elif local_file.endswith(".jsonl"):
                return pd.read_json(local_file, orient="records", lines=True)
            elif local_file.endswith(".csv"):
                return pd.read_csv(local_file)
            elif local_file.endswith(".parquet"):
                return pd.read_parquet(local_file)
            else:
                raise ValueError(f"Unsupported file type: {local_file}")
        except Exception as e:
            # Fallback to sample data if loading fails; include the cause so
            # the failure can be diagnosed from the log.
            logging.error(f"Couldn't read HF dataset from {HF_INPUT_DATASET_PATH}: {e}. Using sample data instead.")
            sample_data = {
                HF_INPUT_DATASET_ID_COLUMN: [f"sample_{i}" for i in range(SAVE_EVERY_N_EXAMPLES)],
                HF_INPUT_DATASET_COLUMN_A: [f"This is sample generation A {i}" for i in range(SAVE_EVERY_N_EXAMPLES)],
                HF_INPUT_DATASET_COLUMN_B: [f"This is sample generation B {i}" for i in range(SAVE_EVERY_N_EXAMPLES)],
            }
            return pd.DataFrame(sample_data)

    def get_current_pair(self):
        """Return (item_id, left_text, right_text) for the current row.

        Returns (None, None, None) once every row has been labeled. A missing
        ID column falls back to "item_<index>"; a missing text column falls
        back to an empty string.
        """
        if self.current_index >= len(self.df):
            return None, None, None

        item = self.df.iloc[self.current_index]
        item_id = item.get(HF_INPUT_DATASET_ID_COLUMN, f"item_{self.current_index}")
        left_text = item.get(HF_INPUT_DATASET_COLUMN_A, "")
        right_text = item.get(HF_INPUT_DATASET_COLUMN_B, "")

        return item_id, left_text, right_text

    def submit_judgment(self, item_id, left_text, right_text, choice):
        """Record a judgment for the given pair and advance to the next one.

        Args:
            item_id: ID of the pair that was judged.
            left_text: Text of generation A as shown to the labeler.
            right_text: Text of generation B as shown to the labeler.
            choice: The labeler's verdict.

        Returns:
            A tuple (next_id, next_left, next_right, current_index). When
            there is nothing to label (no item ID, or every row is already
            labeled) the inputs are returned unchanged and nothing is recorded.
        """
        # Also stop when the dataset is exhausted: the caller may keep sending
        # submissions after the last pair, and those must not be recorded.
        if item_id is None or self.current_index >= len(self.df):
            return item_id, left_text, right_text, self.current_index

        # Record the judgment
        self.results.append({
            "item_id": item_id,
            "generation_a": left_text,
            "generation_b": right_text,
            "judgment": choice,
            "timestamp": datetime.datetime.now().isoformat(),
            "labeler_id": self.labeler_id,  # Anonymous ID for the labeling session
        })

        # Move to next item
        self.current_index += 1

        # Save results periodically, and always after the last item so a
        # trailing partial batch is not lost.
        finished = self.current_index >= len(self.df)
        if finished or len(self.results) % SAVE_EVERY_N_EXAMPLES == 0:
            self.save_results()

        # Get next pair
        next_id, next_left, next_right = self.get_current_pair()
        return next_id, next_left, next_right, self.current_index

    def save_results(self):
        """Upload the pending judgments as a JSON-lines file to the output dataset.

        Does nothing when there are no pending judgments. On success the
        pending list is cleared; on failure the error is logged and the
        judgments are kept so a later call can upload them.
        """
        if not self.results:
            return

        try:
            # Serialize in memory; upload_file accepts raw bytes, so no
            # temporary file has to be created or cleaned up.
            payload = pd.DataFrame(self.results).to_json(orient="records", lines=True).encode("utf-8")

            # The next file number is derived from how many entries already
            # exist in the output directory.
            try:
                num_files = sum(1 for _ in list_repo_tree(repo_id=HF_OUTPUT_DATASET, repo_type="dataset", path_in_repo=HF_OUTPUT_DATASET_DIR))
            except Exception as e:
                # Best effort: presumably the directory does not exist yet on
                # the first upload, so start counting from zero.
                logging.info(f"Couldn't list {HF_OUTPUT_DATASET_DIR} in {HF_OUTPUT_DATASET}: {e}")
                num_files = 0

            # Repo paths always use forward slashes, regardless of the local OS.
            filename = f"results_{num_files + 1}.jsonl"
            if HF_OUTPUT_DATASET_DIR:
                path_in_repo = f"{HF_OUTPUT_DATASET_DIR.rstrip('/')}/{filename}"
            else:
                path_in_repo = filename

            # Push to Hugging Face Hub
            upload_file(repo_id=HF_OUTPUT_DATASET, repo_type="dataset", path_in_repo=path_in_repo, path_or_fileobj=payload)
        except Exception as e:
            logging.error(f"Error saving results: {e}")
            return

        # Capture the count before clearing so the log reports what was saved.
        saved_count = len(self.results)
        self.results = []
        logging.info(f"Saved {saved_count} results to {HF_OUTPUT_DATASET}")

# Create the labeler used by every handler below and fetch the pair that is
# displayed when the page first loads.
labeler = PairwiseLabeler()
initial_id, initial_left, initial_right = labeler.get_current_pair()

with gr.Blocks() as app:
    gr.Markdown(INSTRUCTIONS)

    # Two read-only panes, side by side, one per generation.
    with gr.Row():
        with gr.Column():
            left_output = gr.Textbox(value=initial_left, label="Model Output A", lines=10, interactive=False)

        with gr.Column():
            right_output = gr.Textbox(value=initial_right, label="Model Output B", lines=10, interactive=False)

    # Hidden field carrying the ID of the pair currently on screen.
    item_id = gr.Textbox(value=initial_id, visible=False)

    # One button per possible verdict.
    with gr.Row():
        left_btn = gr.Button("⬅️ A is better", variant="primary")
        right_btn = gr.Button("➡️ B is better", variant="primary")
        tie_btn = gr.Button("🤝 Tie", variant="primary")
        cant_choose_btn = gr.Button("🤔 Can't choose")

    # Read-only progress indicator driven by the labeler's current index.
    _total_samples = len(labeler)
    current_sample_sld = gr.Slider(
        minimum=0,
        maximum=_total_samples,
        step=1,
        value=labeler.current_index,
        interactive=False,
        label='sample_ind',
        info=f"Samples labeled (out of {_total_samples})",
        show_label=False,
        container=False,
        scale=5,
    )

    def judge(choice, item_id, left_text, right_text):
        """Record `choice` for the shown pair and return the next pair plus the new index."""
        next_id, next_left, next_right, next_index = labeler.submit_judgment(
            item_id, left_text, right_text, choice
        )
        return next_id, next_left, next_right, next_index

    def judge_left(item_id, left_text, right_text):
        """Handler for the "A is better" button."""
        return judge("A is better", item_id, left_text, right_text)

    def judge_right(item_id, left_text, right_text):
        """Handler for the "B is better" button."""
        return judge("B is better", item_id, left_text, right_text)

    def judge_tie(item_id, left_text, right_text):
        """Handler for the "Tie" button."""
        return judge("Tie", item_id, left_text, right_text)

    def judge_cant_choose(item_id, left_text, right_text):
        """Handler for the "Can't choose" button."""
        return judge("Can't choose", item_id, left_text, right_text)

    # Every button reads the same three fields and refreshes the same four
    # components, so the wiring is done in one loop (same order as the buttons).
    for _button, _handler in (
        (left_btn, judge_left),
        (right_btn, judge_right),
        (tie_btn, judge_tie),
        (cant_choose_btn, judge_cant_choose),
    ):
        _button.click(
            _handler,
            inputs=[item_id, left_output, right_output],
            outputs=[item_id, left_output, right_output, current_sample_sld],
        )

if __name__ == "__main__":
    app.launch()