import datetime
import os
import re
from pathlib import Path

import gradio as gr
import pandas as pd
import requests
import yaml

# Constants
EVAL_CARDS_DIR = "eval_cards"
TEMPLATE_PATH = "template.yaml"
DEFAULT_MODEL = "llama-3.3-70b-versatile"  # Groq-hosted model used for LLM feedback

# Ensure the eval cards directory exists
os.makedirs(EVAL_CARDS_DIR, exist_ok=True)

# Copy the bundled template to the expected location
with open("yaml_template.yaml", "r") as template_file:
    with open(TEMPLATE_PATH, "w") as f:
        f.write(template_file.read())
def load_template():
    """Load the YAML template"""
    with open(TEMPLATE_PATH, "r") as file:
        return file.read()

def yaml_to_dict(yaml_str):
    """Convert a YAML string to a Python dictionary"""
    try:
        return yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        return {"error": str(e)}
def compute_coverage_score(eval_data):
    """
    Compute a coverage score for the eval card.
    Returns a score from 0-100 and a breakdown of coverage by section.
    """
    # Relative weight of each top-level section. The raw weights sum to 105,
    # so the final score is normalized to a 0-100 scale at the end.
    sections = {
        "metadata": 5,
        "evaluation_design": 10,
        "estimand": 20,
        "estimator": 20,
        "estimate": 20,
        "results_communication": 10,
        "known_issues_and_limitations": 10,
        "version_and_maintenance": 5,
        "citation_and_usage": 5,
    }
    scores = {}
    total_score = 0

    def count_filled_fields(data, prefix=""):
        """Recursively count (filled, total) leaf fields in a nested structure."""
        if isinstance(data, dict):
            filled = 0
            total = 0
            for key, value in data.items():
                if isinstance(value, (dict, list)):
                    sub_filled, sub_total = count_filled_fields(value, f"{prefix}.{key}" if prefix else key)
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
                    if value and not (isinstance(value, str) and value.strip() in ["", "[]", "{}"]):
                        filled += 1
            return filled, total
        elif isinstance(data, list):
            if not data:
                return 0, 1
            filled = 0
            total = 0
            for item in data:
                sub_filled, sub_total = count_filled_fields(item)
                filled += sub_filled
                total += sub_total
            return filled, total
        else:
            return 1 if data else 0, 1

    # Compute scores for each section
    for section, weight in sections.items():
        if section in eval_data:
            filled, total = count_filled_fields(eval_data[section])
            completion_rate = filled / total if total > 0 else 0
            scores[section] = {
                "score": round(completion_rate * weight, 2),
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
                "fields_filled": filled,
                "fields_total": total,
            }
            total_score += scores[section]["score"]
        else:
            scores[section] = {
                "score": 0,
                "max_score": weight,
                "completion_rate": 0,
                "fields_filled": 0,
                "fields_total": 0,
            }

    # Normalize so a fully complete card scores 100 even though weights sum to 105
    total_weight = sum(sections.values())
    return round(total_score / total_weight * 100, 2), scores
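# Illustrative example (hypothetical input): a card whose "metadata" section is
# fully filled and whose other sections are absent earns 5 of the 105 raw
# weight points, i.e. a normalized score of about 4.76:
#   score, details = compute_coverage_score(
#       {"metadata": {"authors": ["A. Author"], "creation_date": "2024-01-01"}})
#   # score == 4.76; details["metadata"]["completion_rate"] == 100.0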
def get_llm_feedback(yaml_content, api_token=None):
    """
    Get feedback on the eval card from Groq's LLM.
    Uses GROQ_API_KEY from environment variables if no token is provided.
    """
    # Load environment variables from a .env file if python-dotenv is installed
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    # Use the provided token or fall back to the environment
    api_token = api_token or os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "API token is required for LLM feedback. Please set the GROQ_API_KEY environment variable or provide a token."

    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_token}",
        }
        prompt = f"""
I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
consistency, and clarity. Provide specific recommendations for improvement.

Focus on:
1. Sections that need more detail
2. Inconsistencies or contradictions
3. Clarity of language and explanations
4. Alignment with best practices for ML evaluation

Here's the YAML content:
```yaml
{yaml_content}
```

Provide your feedback in a structured format with specific, actionable recommendations.
"""
        payload = {
            "model": DEFAULT_MODEL,  # or another Groq-supported model
            "messages": [
                {"role": "user", "content": prompt}
            ],
        }
        # Groq exposes an OpenAI-compatible chat completions endpoint
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
        )
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"
    except Exception as e:
        return f"Error getting Groq LLM feedback: {str(e)}"
def save_eval_card(yaml_content, filename=None):
    """Save an eval card to the repository"""
    try:
        # Parse the YAML to validate it (an empty document parses to None)
        eval_data = yaml.safe_load(yaml_content) or {}

        # Generate a filename if none was provided
        if not filename:
            eval_name = eval_data.get("title", "Unnamed Evaluation")
            # Replace characters that are unsafe in filenames
            filename = re.sub(r'[^\w\-_]', '_', eval_name)
            filename = f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"

        # Save the file
        file_path = os.path.join(EVAL_CARDS_DIR, filename)
        with open(file_path, "w") as file:
            file.write(yaml_content)
        return True, file_path
    except Exception as e:
        return False, str(e)
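# Usage sketch (hypothetical content and timestamp): on success this returns a
# (True, path) pair, e.g.
#   save_eval_card("title: My Eval\n")  # -> (True, "eval_cards/My_Eval_20240101_120000.yaml")
# and (False, <error message>) on failure.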
def load_all_eval_cards():
    """Load all eval cards from the repository"""
    eval_cards = []
    for filename in os.listdir(EVAL_CARDS_DIR):
        if filename.endswith((".yaml", ".yml")):
            file_path = os.path.join(EVAL_CARDS_DIR, filename)
            try:
                with open(file_path, "r") as file:
                    yaml_content = file.read()
                eval_data = yaml.safe_load(yaml_content) or {}

                # Compute coverage score
                score, score_details = compute_coverage_score(eval_data)

                # Extract key metadata
                eval_cards.append({
                    "filename": filename,
                    "title": eval_data.get("title", "Unnamed Evaluation"),
                    "summary": eval_data.get("summary", ""),
                    "authors": ", ".join(eval_data.get("metadata", {}).get("authors", [])),
                    "creation_date": eval_data.get("metadata", {}).get("creation_date", ""),
                    "coverage_score": score,
                    "score_details": score_details,
                    "yaml_content": yaml_content,
                    "data": eval_data,
                })
            except Exception as e:
                print(f"Error loading {filename}: {str(e)}")
    return eval_cards
def format_eval_card_as_html(eval_card):
    """Format an eval card as HTML for display"""
    html = f"""
    <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
        <h3>{eval_card['title']}</h3>
        <p>{eval_card['summary']}</p>
        <p><strong>Authors:</strong> {eval_card['authors']}</p>
        <p><strong>Created:</strong> {eval_card['creation_date']}</p>
        <p><strong>Coverage Score:</strong> {eval_card['coverage_score']}%</p>
        <h4>Coverage by Section:</h4>
        <table style="width: 100%; border-collapse: collapse;">
            <tr>
                <th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
            </tr>
    """
    for section, details in eval_card['score_details'].items():
        html += f"""
            <tr>
                <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['score']}/{details['max_score']}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['completion_rate']}%</td>
            </tr>
        """
    # The viewYaml handler is assumed to be defined elsewhere on the hosting page
    html += f"""
        </table>
        <div style="margin-top: 10px;">
            <a href="#" onclick="viewYaml(this)" data-filename="{eval_card['filename']}" style="text-decoration: none; color: #3273dc;">View YAML</a>
        </div>
    </div>
    """
    return html
def create_eval_cards_table(eval_cards):
    """Create an HTML table of eval cards"""
    if not eval_cards:
        return "<p>No evaluation cards found.</p>"

    # Sort by coverage score (highest first)
    eval_cards.sort(key=lambda x: x['coverage_score'], reverse=True)

    html = ""
    for eval_card in eval_cards:
        html += format_eval_card_as_html(eval_card)
    return html

def upload_file(file):
    """Process an uploaded YAML file"""
    if file is None:
        return "No file uploaded", None
    try:
        # Depending on the Gradio version, gr.File yields raw bytes or a file path
        if isinstance(file, bytes):
            yaml_content = file.decode("utf-8")
        else:
            with open(file.name if hasattr(file, "name") else file, "r") as f:
                yaml_content = f.read()
        # Validate YAML
        eval_data = yaml.safe_load(yaml_content)
        return yaml_content, eval_data
    except Exception as e:
        return f"Error processing file: {str(e)}", None
def get_feedback(yaml_content):
    """Get LLM feedback on the eval card"""
    if not yaml_content:
        return "Please upload or paste a YAML file first."

    api_token = os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "Please set the GROQ_API_KEY environment variable."
    return get_llm_feedback(yaml_content, api_token)

def submit_eval_card(yaml_content):
    """Submit an eval card to the repository"""
    if not yaml_content:
        return "Please upload or paste a YAML file first.", None, None
    try:
        # Validate YAML
        eval_data = yaml.safe_load(yaml_content) or {}

        # Compute coverage score
        score, score_details = compute_coverage_score(eval_data)

        # Save the eval card
        success, file_path = save_eval_card(yaml_content)
        if success:
            return f"Evaluation card saved successfully! Coverage score: {score}%", score, score_details
        else:
            return f"Error saving evaluation card: {file_path}", None, None
    except Exception as e:
        return f"Error processing evaluation card: {str(e)}", None, None

def refresh_gallery():
    """Refresh the gallery of eval cards"""
    eval_cards = load_all_eval_cards()
    html = create_eval_cards_table(eval_cards)

    # Convert the card data to a pandas DataFrame for the table view
    table_data = []
    for card in eval_cards:
        table_data.append({
            "Title": card["title"],
            "Authors": card["authors"],
            "Creation Date": card["creation_date"],
            "Coverage Score": f"{card['coverage_score']}%",
        })
    df = pd.DataFrame(table_data)
    return html, (df if not df.empty else None)

def handle_upload_tab(file_obj, yaml_text):
    """Handle upload tab actions - use the uploaded file if present, else the pasted text"""
    if file_obj is not None:
        yaml_content, eval_data = upload_file(file_obj)
        return yaml_content
    return yaml_text
# Create the Gradio interface
with gr.Blocks(title="Evaluation Card Repository") as app:
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("# Evaluation Card Repository")
            gr.Markdown("""
            This application allows you to upload, validate, and explore ML evaluation cards.
            Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the repository.
            """)

    with gr.Tabs():
        with gr.TabItem("Upload & Review"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(label="Upload YAML File", file_types=[".yaml", ".yml"])
                    with gr.Accordion("Or paste YAML content", open=False):
                        yaml_input = gr.TextArea(label="YAML Content", placeholder="Paste your YAML content here...", lines=10)
                    load_template_btn = gr.Button("Load Template")
                    # api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
                    with gr.Row():
                        get_feedback_btn = gr.Button("Get LLM Feedback")
                        submit_btn = gr.Button("Submit Evaluation Card", variant="primary")
                with gr.Column():
                    yaml_display = gr.TextArea(label="Current YAML", lines=20)
                    with gr.Accordion("LLM Feedback", open=True):
                        feedback_display = gr.Markdown()
                    with gr.Accordion("Submission Result", open=True):
                        result_display = gr.Markdown()
                        coverage_score = gr.Number(label="Coverage Score", visible=False)
                        coverage_details = gr.JSON(label="Coverage Details", visible=False)

        with gr.TabItem("Gallery"):
            refresh_btn = gr.Button("Refresh Gallery")
            with gr.Tabs():
                with gr.TabItem("Card View"):
                    gallery_html = gr.HTML()
                with gr.TabItem("Table View"):
                    gallery_table = gr.DataFrame()

    # Set up event handlers
    load_template_btn.click(
        fn=load_template,
        outputs=[yaml_display]
    )
    file_upload.change(
        fn=handle_upload_tab,
        inputs=[file_upload, yaml_input],
        outputs=[yaml_display]
    )
    yaml_input.change(
        fn=lambda x: x,
        inputs=[yaml_input],
        outputs=[yaml_display]
    )
    get_feedback_btn.click(
        fn=get_feedback,
        inputs=[yaml_display],
        outputs=[feedback_display]
    )
    submit_btn.click(
        fn=submit_eval_card,
        inputs=[yaml_display],
        outputs=[result_display, coverage_score, coverage_details]
    )
    refresh_btn.click(
        fn=refresh_gallery,
        outputs=[gallery_html, gallery_table]
    )

    # Initialize the gallery on app start
    app.load(
        fn=refresh_gallery,
        outputs=[gallery_html, gallery_table]
    )

# Launch the app
if __name__ == "__main__":
    app.launch()