import os
import json
import random
import re

import pandas as pd

from .context_processor import process_highlights

# Global data store - loaded lazily on the first call to load_arena_data()
_ARENA_DATA = None


def load_arena_data():
    """
    Loads the arena data from the arena_df.csv file in the utils directory.
    Returns the loaded DataFrame, or an empty DataFrame if loading fails.
    """
    global _ARENA_DATA

    # If data is already loaded, return it
    if _ARENA_DATA is not None:
        return _ARENA_DATA

    try:
        # Define the path to the CSV file
        csv_path = os.path.join('utils', 'arena_df.csv')

        # Read the CSV file
        df = pd.read_csv(csv_path)
        print(f"Loaded arena data with {len(df)} examples")

        # Store the data globally
        _ARENA_DATA = df
        return df
    except Exception as e:
        print(f"Error loading arena data: {e}")
        # Return an empty DataFrame if the file can't be loaded
        return pd.DataFrame()
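
# Usage sketch (illustrative; assumes the CSV loads successfully): because the
# result is cached in _ARENA_DATA, only the first call reads from disk and
# later calls return the same DataFrame object.
#
#   df_first = load_arena_data()   # reads utils/arena_df.csv
#   df_again = load_arena_data()   # returns the cached DataFrame
#   assert df_first is df_again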


def create_dummy_example():
    """Creates a dummy example, in the same shape as get_random_example(), if no data is loaded."""
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": "",
        "full_contexts": [{"chunk_num": 1, "content": "Error loading context data."}],
        "contexts": [{"chunk_num": 1, "content": "No context available", "is_primary": False}]
    }


def get_random_example():
    """
    Selects a random example from the loaded arena data.
    Returns the example data in a format compatible with the application.
    """
    # Get the globally stored data - won't reload from disk
    df = load_arena_data()

    if df.empty:
        # If no data is loaded, return a dummy example
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]

    # Process the example data
    processed_example = {
        "question": example['question'],
        "processed_context_desc": example.get('processed_context_desc', ''),
        "Answerable": example.get('Answerable', True),  # Default to True unless specified otherwise
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', '')
    }

    # Process contexts - for full context
    try:
        contexts_raw = example['contexts']
        if isinstance(contexts_raw, str):
            contexts = json.loads(contexts_raw)
        else:
            # Already parsed (e.g. a list), use it as-is
            contexts = contexts_raw

        # Store full contexts as individual items
        full_contexts = []
        if isinstance(contexts, list):
            for i, chunk in enumerate(contexts):
                if isinstance(chunk, dict) and 'content' in chunk:
                    full_contexts.append({
                        'chunk_num': i + 1,
                        'content': chunk.get('content', '')
                    })
            processed_example["full_contexts"] = full_contexts
        else:
            processed_example["full_contexts"] = []
    except Exception as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Process highlighted contexts for display
    contexts_highlighted = []
    try:
        # Check if contexts_highlighted exists
        if 'contexts_highlighted' in example and example['contexts_highlighted']:
            highlighted_contexts = []
            if isinstance(example['contexts_highlighted'], str):
                try:
                    # Manually parse the highlighted contexts using regex; this is
                    # more robust than plain JSON parsing for this specific format,
                    # where entries look like
                    #   {"type": "primary", "abbreviatedContent": "..."}
                    raw_str = example['contexts_highlighted']
                    type_pattern = r'"type":\s*"(primary|secondary)"'
                    # The second alternative matches content that is missing its
                    # closing quote (e.g. truncated entries)
                    content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'

                    types = re.findall(type_pattern, raw_str)
                    raw_contents = re.findall(content_pattern, raw_str)

                    # Each match is a tuple (one capture group per alternative);
                    # keep the non-empty capture
                    contents = []
                    for match in raw_contents:
                        content = next((s for s in match if s), "")
                        contents.append(content)

                    # Create the highlighted contexts from the extracted data
                    for ctx_type, content in zip(types, contents):
                        highlighted_contexts.append({
                            'type': ctx_type,
                            'abbreviatedContent': content
                        })
                except Exception as e:
                    print(f"Error extracting contexts with regex: {e}")
            else:
                # Already an object (e.g. a list of dicts), not a string
                highlighted_contexts = example['contexts_highlighted']

            # Process each context item
            for i, item in enumerate(highlighted_contexts):
                if isinstance(item, dict):
                    ctx_type = item.get('type', 'secondary')
                    content = item.get('abbreviatedContent', '')

                    # Process highlights using the standard format
                    content = process_highlights(content)

                    contexts_highlighted.append({
                        'chunk_num': i + 1,
                        'content': content,
                        'is_primary': ctx_type == 'primary'
                    })
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # If we couldn't process the highlighted contexts, fall back to the full contexts
    if not contexts_highlighted and processed_example["full_contexts"]:
        for i, ctx in enumerate(processed_example["full_contexts"]):
            contexts_highlighted.append({
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False
            })

    processed_example["contexts"] = contexts_highlighted

    return processed_example
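
# For reference, a sketch of the dict returned by get_random_example()
# (values below are placeholders, not real data):
#
#   {
#       "question": "...",
#       "processed_context_desc": "...",
#       "Answerable": True,
#       "insufficient": False,
#       "insufficient_reason": "",
#       "full_contexts": [{"chunk_num": 1, "content": "..."}],
#       "contexts": [{"chunk_num": 1, "content": "...", "is_primary": True}],
#   }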


def get_random_example_and_models(model_names):
    """
    Selects a random example from the arena data and assigns two distinct
    random models to positions A and B.
    """
    example = get_random_example()

    # Choose two different models from the model list
    model_a_name, model_b_name = random.sample(model_names, 2)

    return example, model_a_name, model_b_name
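
# Illustrative usage sketch (the module path and model names below are
# placeholders/assumptions, not taken from the application):
#
#   from utils.arena_data import get_random_example_and_models
#
#   example, model_a, model_b = get_random_example_and_models(
#       ["model-alpha", "model-beta", "model-gamma"]
#   )
#   print(example["question"])
#   print(f"Model A: {model_a} | Model B: {model_b}")
#
# Note: random.sample(model_names, 2) requires at least two entries in model_names.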