import datetime
import os
import re
from pathlib import Path
import gradio as gr
import pandas as pd
import requests
import yaml
# Constants
EVAL_CARDS_DIR = "eval_cards"
TEMPLATE_PATH = "template.yaml"
DEFAULT_MODEL = "anthropic/claude-3-haiku-20240307"  # Currently unused; LLM feedback below calls Groq's llama-3.3-70b-versatile
# Ensure the eval cards directory exists
os.makedirs(EVAL_CARDS_DIR, exist_ok=True)
# Copy the bundled template to the expected location
with open("yaml_template.yaml", "r") as template_file:
    with open(TEMPLATE_PATH, "w") as f:
        f.write(template_file.read())
def load_template():
"""Load the YAML template"""
with open(TEMPLATE_PATH, "r") as file:
return file.read()
def yaml_to_dict(yaml_str):
"""Convert YAML string to Python dictionary"""
try:
return yaml.safe_load(yaml_str)
except yaml.YAMLError as e:
return {"error": str(e)}
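# Illustrative behavior (a sketch; the error message comes from PyYAML):
#   yaml_to_dict("title: My Eval")   -> {"title": "My Eval"}
#   yaml_to_dict("title: [unclosed") -> {"error": "<parser message>"}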
def compute_coverage_score(eval_data):
"""
Compute a coverage score for the eval card
Returns a score from 0-100 and a breakdown of coverage by section
"""
sections = {
"metadata": 5,
"evaluation_design": 10,
"estimand": 20,
"estimator": 20,
"estimate": 20,
"results_communication": 10,
"known_issues_and_limitations": 10,
"version_and_maintenance": 5,
"citation_and_usage": 5,
}
scores = {}
total_score = 0
def count_filled_fields(data, prefix=""):
if isinstance(data, dict):
filled = 0
total = 0
for key, value in data.items():
if isinstance(value, (dict, list)):
sub_filled, sub_total = count_filled_fields(value, f"{prefix}.{key}" if prefix else key)
filled += sub_filled
total += sub_total
else:
total += 1
if value and not (isinstance(value, str) and value.strip() in ["", "[]", "{}"]):
filled += 1
return filled, total
elif isinstance(data, list):
if not data:
return 0, 1
filled = 0
total = 0
for item in data:
sub_filled, sub_total = count_filled_fields(item)
filled += sub_filled
total += sub_total
return filled, total
else:
return 1 if data else 0, 1
# Compute scores for each section
for section, weight in sections.items():
if section in eval_data:
filled, total = count_filled_fields(eval_data[section])
completion_rate = filled / total if total > 0 else 0
scores[section] = {
"score": round(completion_rate * weight, 2),
"max_score": weight,
"completion_rate": round(completion_rate * 100, 2),
"fields_filled": filled,
"fields_total": total
}
total_score += scores[section]["score"]
else:
scores[section] = {
"score": 0,
"max_score": weight,
"completion_rate": 0,
"fields_filled": 0,
"fields_total": 0
}
return round(total_score, 2), scores
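# Illustrative example: a card with only a fully filled "metadata" section
# earns at most that section's weight (5 of 100); absent sections score 0.
#   score, details = compute_coverage_score({"metadata": {"authors": ["A. Author"]}})
#   # score == 5.0, details["estimand"]["score"] == 0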
def get_llm_feedback(yaml_content, api_token=None):
"""
Get feedback on the eval card from Groq's LLM
Uses GROQ_API_KEY from environment variables if no token is provided
"""
    from dotenv import load_dotenv

    # os and requests are already imported at module level
    # Load environment variables from a .env file if it exists
    load_dotenv()
# Use provided token or get from environment
api_token = api_token or os.environ.get("GROQ_API_KEY")
if not api_token:
return "API token is required for LLM feedback. Please set the GROQ_API_KEY environment variable or provide a token."
try:
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_token}"
}
prompt = f"""
I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
consistency, and clarity. Provide specific recommendations for improvement.
Focus on:
1. Sections that need more detail
2. Inconsistencies or contradictions
3. Clarity of language and explanations
4. Alignment with best practices for ML evaluation
Here's the YAML content:
```yaml
{yaml_content}
```
Provide your feedback in a structured format with specific, actionable recommendations.
"""
payload = {
"model": "llama-3.3-70b-versatile", # or another groq supported model
"messages": [
{"role": "user", "content": prompt}
]
}
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,  # avoid hanging the UI if the API stalls
        )
if response.status_code == 200:
return response.json()["choices"][0]["message"]["content"]
else:
return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"
except Exception as e:
return f"Error getting Groq LLM feedback: {str(e)}"
def save_eval_card(yaml_content, filename=None):
"""Save an eval card to the repository"""
try:
# Parse YAML to validate it
eval_data = yaml.safe_load(yaml_content)
# Generate filename if not provided
if not filename:
eval_name = eval_data.get("title", "Unnamed Evaluation")
# Clean filename
filename = re.sub(r'[^\w\-_]', '_', eval_name)
filename = f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
# Save file
file_path = os.path.join(EVAL_CARDS_DIR, filename)
with open(file_path, "w") as file:
file.write(yaml_content)
return True, file_path
except Exception as e:
return False, str(e)
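# Illustrative example: with no filename given, the card's title is
# slugified and timestamped, so a card titled "My Eval" saved on
# 2024-01-02 03:04:05 would land at eval_cards/My_Eval_20240102_030405.yaml:
#   ok, path = save_eval_card("title: My Eval\nsummary: demo")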
def load_all_eval_cards():
"""Load all eval cards from the repository"""
eval_cards = []
for filename in os.listdir(EVAL_CARDS_DIR):
if filename.endswith(".yaml"):
file_path = os.path.join(EVAL_CARDS_DIR, filename)
try:
with open(file_path, "r") as file:
yaml_content = file.read()
eval_data = yaml.safe_load(yaml_content)
# Compute coverage score
score, score_details = compute_coverage_score(eval_data)
# Extract key metadata
eval_cards.append({
"filename": filename,
"title": eval_data.get("title", "Unnamed Evaluation"),
"summary": eval_data.get("summary", ""),
"authors": ", ".join(eval_data.get("metadata", {}).get("authors", [])),
"creation_date": eval_data.get("metadata", {}).get("creation_date", ""),
"coverage_score": score,
"score_details": score_details,
"yaml_content": yaml_content,
"data": eval_data
})
except Exception as e:
print(f"Error loading {filename}: {str(e)}")
return eval_cards
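# Each returned record bundles the raw YAML with derived fields, e.g.:
#   cards = load_all_eval_cards()
#   # cards[0] has keys: filename, title, summary, authors, creation_date,
#   #                    coverage_score, score_details, yaml_content, data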
def format_eval_card_as_html(eval_card):
"""Format an eval card as HTML for display"""
html = f"""
<div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
<h3>{eval_card['title']}</h3>
<p>{eval_card['summary']}</p>
<p><strong>Authors:</strong> {eval_card['authors']}</p>
<p><strong>Created:</strong> {eval_card['creation_date']}</p>
<p><strong>Coverage Score:</strong> {eval_card['coverage_score']}%</p>
<h4>Coverage by Section:</h4>
<table style="width: 100%; border-collapse: collapse;">
<tr>
<th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
<th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
<th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
</tr>
"""
for section, details in eval_card['score_details'].items():
html += f"""
<tr>
<td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
<td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['score']}/{details['max_score']}</td>
<td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['completion_rate']}%</td>
</tr>
"""
html += """
</table>
<div style="margin-top: 10px;">
<a href="#" onclick="viewYaml(this)" data-filename="{eval_card['filename']}" style="text-decoration: none; color: #3273dc;">View YAML</a>
</div>
</div>
"""
return html
def create_eval_cards_table(eval_cards):
"""Create an HTML table of eval cards"""
if not eval_cards:
return "<p>No evaluation cards found.</p>"
# Sort by coverage score (highest first)
eval_cards.sort(key=lambda x: x['coverage_score'], reverse=True)
html = ""
for eval_card in eval_cards:
html += format_eval_card_as_html(eval_card)
return html
def upload_file(file):
"""Process an uploaded YAML file"""
if file is None:
return "No file uploaded", None
    try:
        # Gradio's File component may pass bytes, a path string, or a
        # tempfile-like object depending on version and configuration
        if isinstance(file, bytes):
            yaml_content = file.decode("utf-8")
        else:
            with open(getattr(file, "name", file), "r") as fh:
                yaml_content = fh.read()
        # Validate YAML
        eval_data = yaml.safe_load(yaml_content)
        return yaml_content, eval_data
    except Exception as e:
        return f"Error processing file: {str(e)}", None
def get_feedback(yaml_content):
"""Get LLM feedback on the eval card"""
if not yaml_content:
return "Please upload or paste a YAML file first."
    # The UI token field is commented out below, so read the key from the environment
    api_token = os.environ.get("GROQ_API_KEY")
if not api_token:
return "Please provide an API token or set the GROQ_API_KEY environment variable."
feedback = get_llm_feedback(yaml_content, api_token)
return feedback
def submit_eval_card(yaml_content):
"""Submit an eval card to the repository"""
if not yaml_content:
return "Please upload or paste a YAML file first.", None, None
try:
# Validate YAML
eval_data = yaml.safe_load(yaml_content)
# Compute coverage score
score, score_details = compute_coverage_score(eval_data)
# Save eval card
success, file_path = save_eval_card(yaml_content)
if success:
return f"Evaluation card saved successfully! Coverage score: {score}%", score, score_details
else:
return f"Error saving evaluation card: {file_path}", None, None
except Exception as e:
return f"Error processing evaluation card: {str(e)}", None, None
def refresh_gallery():
"""Refresh the gallery of eval cards"""
eval_cards = load_all_eval_cards()
html = create_eval_cards_table(eval_cards)
# Convert data to pandas DataFrame for table view
table_data = []
for card in eval_cards:
table_data.append({
"Title": card["title"],
"Authors": card["authors"],
"Creation Date": card["creation_date"],
"Coverage Score": f"{card['coverage_score']}%"
})
df = pd.DataFrame(table_data)
return html, df if not df.empty else None
def handle_upload_tab(file_obj, yaml_text):
"""Handle upload tab actions - either use uploaded file or pasted text"""
    if file_obj is not None:
        yaml_content, _ = upload_file(file_obj)  # parsed data not needed here
        return yaml_content
else:
return yaml_text
# Create the Gradio interface
with gr.Blocks(title="Evaluation Card Repository") as app:
with gr.Row():
with gr.Column(scale=2):
gr.Markdown("# Evaluation Card Repository")
gr.Markdown("""
This application allows you to upload, validate, and explore ML evaluation cards.
Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the repository.
""")
with gr.Tabs():
with gr.TabItem("Upload & Review"):
with gr.Row():
with gr.Column():
file_upload = gr.File(label="Upload YAML File", file_types=[".yaml", ".yml"])
with gr.Accordion("Or paste YAML content", open=False):
yaml_input = gr.TextArea(label="YAML Content", placeholder="Paste your YAML content here...", lines=10)
load_template_btn = gr.Button("Load Template")
# api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
with gr.Row():
get_feedback_btn = gr.Button("Get LLM Feedback")
submit_btn = gr.Button("Submit Evaluation Card", variant="primary")
with gr.Column():
yaml_display = gr.TextArea(label="Current YAML", lines=20)
with gr.Accordion("LLM Feedback", open=True):
feedback_display = gr.Markdown()
with gr.Accordion("Submission Result", open=True):
result_display = gr.Markdown()
coverage_score = gr.Number(label="Coverage Score", visible=False)
coverage_details = gr.JSON(label="Coverage Details", visible=False)
with gr.TabItem("Gallery"):
refresh_btn = gr.Button("Refresh Gallery")
with gr.Tabs():
with gr.TabItem("Card View"):
gallery_html = gr.HTML()
with gr.TabItem("Table View"):
gallery_table = gr.DataFrame()
# Set up event handlers
load_template_btn.click(
fn=load_template,
outputs=[yaml_display]
)
file_upload.change(
fn=handle_upload_tab,
inputs=[file_upload, yaml_input],
outputs=[yaml_display]
)
yaml_input.change(
fn=lambda x: x,
inputs=[yaml_input],
outputs=[yaml_display]
)
get_feedback_btn.click(
fn=get_feedback,
inputs=[yaml_display],
outputs=[feedback_display]
)
submit_btn.click(
fn=submit_eval_card,
inputs=[yaml_display],
outputs=[result_display, coverage_score, coverage_details]
)
refresh_btn.click(
fn=refresh_gallery,
outputs=[gallery_html, gallery_table]
)
# Initialize the gallery on app start
app.load(
fn=refresh_gallery,
outputs=[gallery_html, gallery_table]
)
# Launch the app
if __name__ == "__main__":
app.launch()