import os
import json
import random
import re

import pandas as pd

from .context_processor import process_highlights

# Global data store - loaded lazily on the first call to load_arena_data()
_ARENA_DATA = None


def load_arena_data():
    """
    Loads the arena data from the arena_df.csv file in the utils directory.
    Returns the loaded DataFrame, or an empty DataFrame if loading fails.
    """
    global _ARENA_DATA

    # If data is already loaded, return it
    if _ARENA_DATA is not None:
        return _ARENA_DATA

    try:
        # Define the path to the CSV file
        csv_path = os.path.join('utils', 'arena_df.csv')

        # Read the CSV file
        df = pd.read_csv(csv_path)
        print(f"Loaded arena data with {len(df)} examples")

        # Store the data globally
        _ARENA_DATA = df
        return df
    except Exception as e:
        print(f"Error loading arena data: {e}")
        # Return an empty DataFrame if the file can't be loaded
        return pd.DataFrame()
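
# Usage sketch (illustrative; assumes the CSV loads successfully): because the
# result is cached in _ARENA_DATA, only the first call reads from disk and
# later calls return the same DataFrame object.
#
#   df_first = load_arena_data()   # reads utils/arena_df.csv
#   df_again = load_arena_data()   # returns the cached DataFrame
#   assert df_first is df_again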


def create_dummy_example():
    """Creates a dummy example, in the same shape as get_random_example(), if no data is loaded."""
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": "",
        "full_contexts": [{"chunk_num": 1, "content": "Error loading context data."}],
        "contexts": [{"chunk_num": 1, "content": "No context available", "is_primary": False}]
    }


def get_random_example():
    """
    Selects a random example from the loaded arena data.
    Returns the example data in a format compatible with the application.
    """
    # Get the globally stored data - won't reload from disk
    df = load_arena_data()

    if df.empty:
        # If no data is loaded, return a dummy example
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]

    # Process the example data
    processed_example = {
        "question": example['question'],
        "processed_context_desc": example.get('processed_context_desc', ''),
        "Answerable": example.get('Answerable', True),  # Default to True unless specified otherwise
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', '')
    }

    # Process contexts - for full context
    try:
        contexts_raw = example['contexts']
        if isinstance(contexts_raw, str):
            contexts = json.loads(contexts_raw)
        else:
            # Already parsed (e.g. a list), use it as-is
            contexts = contexts_raw

        # Store full contexts as individual items
        full_contexts = []
        if isinstance(contexts, list):
            for i, chunk in enumerate(contexts):
                if isinstance(chunk, dict) and 'content' in chunk:
                    full_contexts.append({
                        'chunk_num': i + 1,
                        'content': chunk.get('content', '')
                    })
            processed_example["full_contexts"] = full_contexts
        else:
            processed_example["full_contexts"] = []
    except Exception as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Process highlighted contexts for display
    contexts_highlighted = []
    try:
        # Check if contexts_highlighted exists
        if 'contexts_highlighted' in example and example['contexts_highlighted']:
            highlighted_contexts = []
            if isinstance(example['contexts_highlighted'], str):
                try:
                    # Manually parse the highlighted contexts using regex; this is
                    # more robust than plain JSON parsing for this specific format,
                    # where entries look like
                    #   {"type": "primary", "abbreviatedContent": "..."}
                    raw_str = example['contexts_highlighted']
                    type_pattern = r'"type":\s*"(primary|secondary)"'
                    # The second alternative matches content that is missing its
                    # closing quote (e.g. truncated entries)
                    content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'

                    types = re.findall(type_pattern, raw_str)
                    raw_contents = re.findall(content_pattern, raw_str)

                    # Each match is a tuple (one capture group per alternative);
                    # keep the non-empty capture
                    contents = []
                    for match in raw_contents:
                        content = next((s for s in match if s), "")
                        contents.append(content)

                    # Create the highlighted contexts from the extracted data
                    for ctx_type, content in zip(types, contents):
                        highlighted_contexts.append({
                            'type': ctx_type,
                            'abbreviatedContent': content
                        })
                except Exception as e:
                    print(f"Error extracting contexts with regex: {e}")
            else:
                # Already an object (e.g. a list of dicts), not a string
                highlighted_contexts = example['contexts_highlighted']

            # Process each context item
            for i, item in enumerate(highlighted_contexts):
                if isinstance(item, dict):
                    ctx_type = item.get('type', 'secondary')
                    content = item.get('abbreviatedContent', '')

                    # Process highlights using the standard format
                    content = process_highlights(content)

                    contexts_highlighted.append({
                        'chunk_num': i + 1,
                        'content': content,
                        'is_primary': ctx_type == 'primary'
                    })
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # If we couldn't process the highlighted contexts, fall back to the full contexts
    if not contexts_highlighted and processed_example["full_contexts"]:
        for i, ctx in enumerate(processed_example["full_contexts"]):
            contexts_highlighted.append({
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False
            })

    processed_example["contexts"] = contexts_highlighted

    return processed_example
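
# For reference, a sketch of the dict returned by get_random_example()
# (values below are placeholders, not real data):
#
#   {
#       "question": "...",
#       "processed_context_desc": "...",
#       "Answerable": True,
#       "insufficient": False,
#       "insufficient_reason": "",
#       "full_contexts": [{"chunk_num": 1, "content": "..."}],
#       "contexts": [{"chunk_num": 1, "content": "...", "is_primary": True}],
#   }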


def get_random_example_and_models(model_names):
    """
    Selects a random example from the arena data and assigns two distinct
    random models to positions A and B.
    """
    example = get_random_example()

    # Choose two different models from the model list
    model_a_name, model_b_name = random.sample(model_names, 2)

    return example, model_a_name, model_b_name
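
# Illustrative usage sketch (the module path and model names below are
# placeholders/assumptions, not taken from the application):
#
#   from utils.arena_data import get_random_example_and_models
#
#   example, model_a, model_b = get_random_example_and_models(
#       ["model-alpha", "model-beta", "model-gamma"]
#   )
#   print(example["question"])
#   print(f"Model A: {model_a} | Model B: {model_b}")
#
# Note: random.sample(model_names, 2) requires at least two entries in model_names.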