import datetime
import os
import re
from pathlib import Path

import gradio as gr
import pandas as pd
import requests
import yaml

# Constants
EVAL_CARDS_DIR = "eval_cards"
TEMPLATE_PATH = "template.yaml"
DEFAULT_MODEL = "llama-3.3-70b-versatile"  # Groq-hosted model used for LLM feedback

# Ensure the eval cards directory exists
os.makedirs(EVAL_CARDS_DIR, exist_ok=True)

# Copy the bundled template to the expected location
with open("yaml_template.yaml", "r") as template_file:
    with open(TEMPLATE_PATH, "w") as f:
        f.write(template_file.read())
def load_template():
    """Load the YAML template"""
    with open(TEMPLATE_PATH, "r") as file:
        return file.read()

def yaml_to_dict(yaml_str):
    """Convert a YAML string to a Python dictionary"""
    try:
        return yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        return {"error": str(e)}
def compute_coverage_score(eval_data):
    """
    Compute a coverage score for the eval card.
    Returns a score from 0-100 and a breakdown of coverage by section.
    """
    # Relative weight of each top-level section. The raw weights sum to 105,
    # so the final score is normalized to a 0-100 scale at the end.
    sections = {
        "metadata": 5,
        "evaluation_design": 10,
        "estimand": 20,
        "estimator": 20,
        "estimate": 20,
        "results_communication": 10,
        "known_issues_and_limitations": 10,
        "version_and_maintenance": 5,
        "citation_and_usage": 5,
    }
    scores = {}
    total_score = 0

    def count_filled_fields(data, prefix=""):
        """Recursively count (filled, total) leaf fields in a nested structure."""
        if isinstance(data, dict):
            filled = 0
            total = 0
            for key, value in data.items():
                if isinstance(value, (dict, list)):
                    sub_filled, sub_total = count_filled_fields(value, f"{prefix}.{key}" if prefix else key)
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
                    if value and not (isinstance(value, str) and value.strip() in ["", "[]", "{}"]):
                        filled += 1
            return filled, total
        elif isinstance(data, list):
            if not data:
                return 0, 1
            filled = 0
            total = 0
            for item in data:
                sub_filled, sub_total = count_filled_fields(item)
                filled += sub_filled
                total += sub_total
            return filled, total
        else:
            return 1 if data else 0, 1

    # Compute scores for each section
    for section, weight in sections.items():
        if section in eval_data:
            filled, total = count_filled_fields(eval_data[section])
            completion_rate = filled / total if total > 0 else 0
            scores[section] = {
                "score": round(completion_rate * weight, 2),
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
                "fields_filled": filled,
                "fields_total": total,
            }
            total_score += scores[section]["score"]
        else:
            scores[section] = {
                "score": 0,
                "max_score": weight,
                "completion_rate": 0,
                "fields_filled": 0,
                "fields_total": 0,
            }

    # Normalize so a fully complete card scores 100 even though weights sum to 105
    total_weight = sum(sections.values())
    return round(total_score / total_weight * 100, 2), scores
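# Illustrative example (hypothetical input): a card whose "metadata" section is
# fully filled and whose other sections are absent earns 5 of the 105 raw
# weight points, i.e. a normalized score of about 4.76:
#   score, details = compute_coverage_score(
#       {"metadata": {"authors": ["A. Author"], "creation_date": "2024-01-01"}})
#   # score == 4.76; details["metadata"]["completion_rate"] == 100.0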
def get_llm_feedback(yaml_content, api_token=None):
    """
    Get feedback on the eval card from Groq's LLM.
    Uses GROQ_API_KEY from environment variables if no token is provided.
    """
    # Load environment variables from a .env file if python-dotenv is installed
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    # Use the provided token or fall back to the environment
    api_token = api_token or os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "API token is required for LLM feedback. Please set the GROQ_API_KEY environment variable or provide a token."

    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_token}",
        }
        prompt = f"""
I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
consistency, and clarity. Provide specific recommendations for improvement.

Focus on:
1. Sections that need more detail
2. Inconsistencies or contradictions
3. Clarity of language and explanations
4. Alignment with best practices for ML evaluation

Here's the YAML content:
```yaml
{yaml_content}
```

Provide your feedback in a structured format with specific, actionable recommendations.
"""
        payload = {
            "model": DEFAULT_MODEL,  # or another Groq-supported model
            "messages": [
                {"role": "user", "content": prompt}
            ],
        }
        # Groq exposes an OpenAI-compatible chat completions endpoint
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
        )
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"
    except Exception as e:
        return f"Error getting Groq LLM feedback: {str(e)}"
def save_eval_card(yaml_content, filename=None):
    """Save an eval card to the repository"""
    try:
        # Parse the YAML to validate it (an empty document parses to None)
        eval_data = yaml.safe_load(yaml_content) or {}

        # Generate a filename if none was provided
        if not filename:
            eval_name = eval_data.get("title", "Unnamed Evaluation")
            # Replace characters that are unsafe in filenames
            filename = re.sub(r'[^\w\-_]', '_', eval_name)
            filename = f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"

        # Save the file
        file_path = os.path.join(EVAL_CARDS_DIR, filename)
        with open(file_path, "w") as file:
            file.write(yaml_content)
        return True, file_path
    except Exception as e:
        return False, str(e)
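# Usage sketch (hypothetical content and timestamp): on success this returns a
# (True, path) pair, e.g.
#   save_eval_card("title: My Eval\n")  # -> (True, "eval_cards/My_Eval_20240101_120000.yaml")
# and (False, <error message>) on failure.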
def load_all_eval_cards():
    """Load all eval cards from the repository"""
    eval_cards = []
    for filename in os.listdir(EVAL_CARDS_DIR):
        if filename.endswith((".yaml", ".yml")):
            file_path = os.path.join(EVAL_CARDS_DIR, filename)
            try:
                with open(file_path, "r") as file:
                    yaml_content = file.read()
                eval_data = yaml.safe_load(yaml_content) or {}

                # Compute coverage score
                score, score_details = compute_coverage_score(eval_data)

                # Extract key metadata
                eval_cards.append({
                    "filename": filename,
                    "title": eval_data.get("title", "Unnamed Evaluation"),
                    "summary": eval_data.get("summary", ""),
                    "authors": ", ".join(eval_data.get("metadata", {}).get("authors", [])),
                    "creation_date": eval_data.get("metadata", {}).get("creation_date", ""),
                    "coverage_score": score,
                    "score_details": score_details,
                    "yaml_content": yaml_content,
                    "data": eval_data,
                })
            except Exception as e:
                print(f"Error loading {filename}: {str(e)}")
    return eval_cards
def format_eval_card_as_html(eval_card):
    """Format an eval card as HTML for display"""
    html = f"""
    <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
        <h3>{eval_card['title']}</h3>
        <p>{eval_card['summary']}</p>
        <p><strong>Authors:</strong> {eval_card['authors']}</p>
        <p><strong>Created:</strong> {eval_card['creation_date']}</p>
        <p><strong>Coverage Score:</strong> {eval_card['coverage_score']}%</p>
        <h4>Coverage by Section:</h4>
        <table style="width: 100%; border-collapse: collapse;">
            <tr>
                <th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
            </tr>
    """
    for section, details in eval_card['score_details'].items():
        html += f"""
            <tr>
                <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['score']}/{details['max_score']}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['completion_rate']}%</td>
            </tr>
        """
    # The viewYaml handler is assumed to be defined elsewhere on the hosting page
    html += f"""
        </table>
        <div style="margin-top: 10px;">
            <a href="#" onclick="viewYaml(this)" data-filename="{eval_card['filename']}" style="text-decoration: none; color: #3273dc;">View YAML</a>
        </div>
    </div>
    """
    return html
def create_eval_cards_table(eval_cards):
    """Create an HTML table of eval cards"""
    if not eval_cards:
        return "<p>No evaluation cards found.</p>"

    # Sort by coverage score (highest first)
    eval_cards.sort(key=lambda x: x['coverage_score'], reverse=True)

    html = ""
    for eval_card in eval_cards:
        html += format_eval_card_as_html(eval_card)
    return html

def upload_file(file):
    """Process an uploaded YAML file"""
    if file is None:
        return "No file uploaded", None
    try:
        # Depending on the Gradio version, gr.File yields raw bytes or a file path
        if isinstance(file, bytes):
            yaml_content = file.decode("utf-8")
        else:
            with open(file.name if hasattr(file, "name") else file, "r") as f:
                yaml_content = f.read()
        # Validate YAML
        eval_data = yaml.safe_load(yaml_content)
        return yaml_content, eval_data
    except Exception as e:
        return f"Error processing file: {str(e)}", None
def get_feedback(yaml_content):
    """Get LLM feedback on the eval card"""
    if not yaml_content:
        return "Please upload or paste a YAML file first."

    api_token = os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "Please set the GROQ_API_KEY environment variable."
    return get_llm_feedback(yaml_content, api_token)

def submit_eval_card(yaml_content):
    """Submit an eval card to the repository"""
    if not yaml_content:
        return "Please upload or paste a YAML file first.", None, None
    try:
        # Validate YAML
        eval_data = yaml.safe_load(yaml_content) or {}

        # Compute coverage score
        score, score_details = compute_coverage_score(eval_data)

        # Save the eval card
        success, file_path = save_eval_card(yaml_content)
        if success:
            return f"Evaluation card saved successfully! Coverage score: {score}%", score, score_details
        else:
            return f"Error saving evaluation card: {file_path}", None, None
    except Exception as e:
        return f"Error processing evaluation card: {str(e)}", None, None

def refresh_gallery():
    """Refresh the gallery of eval cards"""
    eval_cards = load_all_eval_cards()
    html = create_eval_cards_table(eval_cards)

    # Convert the card data to a pandas DataFrame for the table view
    table_data = []
    for card in eval_cards:
        table_data.append({
            "Title": card["title"],
            "Authors": card["authors"],
            "Creation Date": card["creation_date"],
            "Coverage Score": f"{card['coverage_score']}%",
        })
    df = pd.DataFrame(table_data)
    return html, (df if not df.empty else None)

def handle_upload_tab(file_obj, yaml_text):
    """Handle upload tab actions - use the uploaded file if present, else the pasted text"""
    if file_obj is not None:
        yaml_content, eval_data = upload_file(file_obj)
        return yaml_content
    return yaml_text
# Create the Gradio interface
with gr.Blocks(title="Evaluation Card Repository") as app:
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("# Evaluation Card Repository")
            gr.Markdown("""
            This application allows you to upload, validate, and explore ML evaluation cards.
            Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the repository.
            """)

    with gr.Tabs():
        with gr.TabItem("Upload & Review"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(label="Upload YAML File", file_types=[".yaml", ".yml"])
                    with gr.Accordion("Or paste YAML content", open=False):
                        yaml_input = gr.TextArea(label="YAML Content", placeholder="Paste your YAML content here...", lines=10)
                    load_template_btn = gr.Button("Load Template")
                    # api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
                    with gr.Row():
                        get_feedback_btn = gr.Button("Get LLM Feedback")
                        submit_btn = gr.Button("Submit Evaluation Card", variant="primary")
                with gr.Column():
                    yaml_display = gr.TextArea(label="Current YAML", lines=20)
                    with gr.Accordion("LLM Feedback", open=True):
                        feedback_display = gr.Markdown()
                    with gr.Accordion("Submission Result", open=True):
                        result_display = gr.Markdown()
                        coverage_score = gr.Number(label="Coverage Score", visible=False)
                        coverage_details = gr.JSON(label="Coverage Details", visible=False)

        with gr.TabItem("Gallery"):
            refresh_btn = gr.Button("Refresh Gallery")
            with gr.Tabs():
                with gr.TabItem("Card View"):
                    gallery_html = gr.HTML()
                with gr.TabItem("Table View"):
                    gallery_table = gr.DataFrame()

    # Set up event handlers
    load_template_btn.click(
        fn=load_template,
        outputs=[yaml_display]
    )
    file_upload.change(
        fn=handle_upload_tab,
        inputs=[file_upload, yaml_input],
        outputs=[yaml_display]
    )
    yaml_input.change(
        fn=lambda x: x,
        inputs=[yaml_input],
        outputs=[yaml_display]
    )
    get_feedback_btn.click(
        fn=get_feedback,
        inputs=[yaml_display],
        outputs=[feedback_display]
    )
    submit_btn.click(
        fn=submit_eval_card,
        inputs=[yaml_display],
        outputs=[result_display, coverage_score, coverage_details]
    )
    refresh_btn.click(
        fn=refresh_gallery,
        outputs=[gallery_html, gallery_table]
    )

    # Initialize the gallery on app start
    app.load(
        fn=refresh_gallery,
        outputs=[gallery_html, gallery_table]
    )

# Launch the app
if __name__ == "__main__":
    app.launch()