Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import json | |
| import pandas as pd | |
| import random | |
| import re | |
# Module-level cache for the arena DataFrame. It starts empty and is filled
# lazily by the first successful load_arena_data() call; nothing is read
# from disk at import time.
_ARENA_DATA = None


def load_arena_data():
    """Return the arena dataset, reading it from disk at most once on success.

    The data comes from ``utils/arena_df.csv`` (a relative path, so it is
    resolved against the current working directory). A successful read is
    cached in the module-level ``_ARENA_DATA`` and every later call returns
    that same DataFrame object.

    Returns:
        The cached DataFrame, or a new empty DataFrame when the file cannot
        be read. The empty fallback is NOT cached, so the next call tries
        the file again.
    """
    global _ARENA_DATA

    if _ARENA_DATA is None:
        source_file = os.path.join('utils', 'arena_df.csv')
        try:
            frame = pd.read_csv(source_file)
            print(f"Loaded arena data with {len(frame)} examples")
        except Exception as err:
            # Best-effort load: report the problem and let the caller fall
            # back to a placeholder instead of crashing the application.
            print(f"Error loading arena data: {err}")
            return pd.DataFrame()
        _ARENA_DATA = frame

    return _ARENA_DATA
def create_dummy_example():
    """Build the placeholder example used when no arena data could be loaded.

    The dict carries the same keys as a real example produced by
    get_random_example(), including ``sample_id`` (0, the same default a
    real example gets when the column is missing), so consumers can index
    the result without special-casing the error path.

    Returns:
        A fresh dict on every call; its list values are not shared between
        calls, so callers may mutate them safely.
    """
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": [],
        "full_contexts": [],
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": "Data loading error",
        "sample_id": 0,
        "answer": "No reference answer available due to data loading error.",
    }
def _context_texts(items):
    """Return the text of every usable context entry in *items*.

    Strings are kept unchanged, dicts contribute their 'content' value, and
    any other entry is skipped.
    """
    texts = []
    for item in items:
        if isinstance(item, str):
            texts.append(item)
        elif isinstance(item, dict) and 'content' in item:
            texts.append(item['content'])
    return texts


def _parse_full_contexts(raw):
    """Turn a raw 'contexts' cell into a list of context strings.

    Accepts an already-built list, a string holding a Python list literal or
    a JSON array, or a plain string (treated as one single context). Values
    of any other type yield an empty list.
    """
    # Local import: ast is only needed here and the module does not import
    # it at file level.
    import ast

    if isinstance(raw, list):
        return _context_texts(raw)
    if not isinstance(raw, str):
        return []

    stripped = raw.strip()
    if not (stripped.startswith('[') and stripped.endswith(']')):
        # Not list-shaped: the whole cell is one context.
        return [raw]

    try:
        # Handles Python list literals such as "['string1', 'string2']".
        parsed = ast.literal_eval(stripped)
    except (SyntaxError, ValueError):
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError:
            # Looks like a list but parses as neither format.
            return [raw]
    # Both parsers feed the same normaliser, so dict entries are reduced to
    # their 'content' text no matter which format the cell used.
    return _context_texts(parsed)


def _parse_highlighted_contexts(raw):
    """Turn a raw 'contexts_highlighted' cell into display dicts.

    Accepts a JSON string or an already-built list. Each dict entry becomes
    ``{'is_primary': bool, 'content': str}``; non-dict entries are skipped.
    The 'abbreviatedContent' value is passed through unchanged.
    """
    if isinstance(raw, str):
        try:
            entries = json.loads(raw)
        except json.JSONDecodeError:
            print(f"Error parsing contexts_highlighted JSON: {raw[:100]}...")
            return []
    elif isinstance(raw, list):
        entries = raw
    else:
        return []

    return [
        {
            'is_primary': entry.get('type', 'secondary') == 'primary',
            'content': entry.get('abbreviatedContent', ''),
        }
        for entry in entries
        if isinstance(entry, dict)
    ]


def get_random_example():
    """Pick one random row of the arena data and shape it for the application.

    Returns:
        A dict with the keys 'question', 'Answerable', 'insufficient',
        'insufficient_reason', 'sample_id', 'answer',
        'processed_context_desc', 'contexts' (list of
        ``{'is_primary', 'content'}`` dicts) and 'full_contexts' (list of
        context strings). When no data is loaded, the placeholder from
        create_dummy_example() is returned instead.
    """
    # load_arena_data() serves the cached frame after its first successful read.
    df = load_arena_data()
    if df.empty:
        return create_dummy_example()

    example = df.sample(1).iloc[0]

    processed_example = {
        "question": example['question'],
        "Answerable": not example.get('insufficient', False),
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', ''),
        "sample_id": example.get('sample_id', 0),
        "answer": example.get('answer', ''),
    }

    # A missing description is normalised to an empty string.
    context_desc = example.get('processed_context_desc', '')
    if pd.isna(context_desc):
        context_desc = ""
    processed_example["processed_context_desc"] = context_desc

    # Context parsing is best-effort: a malformed cell is reported and leaves
    # the corresponding list empty rather than failing the whole example.
    full_contexts = []
    try:
        raw_contexts = example.get('contexts')
        if raw_contexts:
            full_contexts = _parse_full_contexts(raw_contexts)
    except Exception as e:
        print(f"Error processing full contexts: {e}")

    contexts_highlighted = []
    try:
        raw_highlights = example.get('contexts_highlighted')
        if raw_highlights:
            contexts_highlighted = _parse_highlighted_contexts(raw_highlights)
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # Without usable highlight data, show the full contexts as non-primary
    # entries so the context list is still populated.
    if not contexts_highlighted and full_contexts:
        contexts_highlighted = [
            {'is_primary': False, 'content': content}
            for content in full_contexts
        ]

    processed_example["contexts"] = contexts_highlighted
    processed_example["full_contexts"] = full_contexts
    return processed_example
def get_random_example_and_models(model_names):
    """Pick a random arena example and two different models for slots A and B.

    Args:
        model_names: Any iterable of model names (list, tuple, set, dict
            keys, ...). It is copied into a list before sampling because
            random.sample() only accepts sequences.

    Returns:
        A tuple ``(example, model_a_name, model_b_name)``. The two names are
        taken from different positions of ``model_names``.

    Raises:
        ValueError: If fewer than two model names are supplied.
    """
    example = get_random_example()

    candidates = list(model_names)
    if len(candidates) < 2:
        raise ValueError(
            "Need at least two model names to pick models A and B, "
            f"got {len(candidates)}"
        )

    # Sampling without replacement guarantees A and B come from different
    # entries of the candidate list.
    model_a_name, model_b_name = random.sample(candidates, 2)
    return example, model_a_name, model_b_name