# eval-cards-gallery / template.yaml
# n0w0f's picture
# chore: update, cards
# 4c9761f
# Evaluation Card Template
# Replace the bracketed placeholder with the name of the evaluation.
title: "[Evaluation Name]"

# Folded block scalar: the body must be indented deeper than the key,
# otherwise `summary` is empty and the text becomes a stray scalar.
summary: >
  Brief description of the evaluation approach, its purpose, and scope.
# Card-level metadata. Every field starts empty ("" or []).
metadata:
  authors: []
  maintainers: []
  # Dates stay quoted strings so parsers do not coerce them to date objects.
  creation_date: ""
  last_review_date: ""
  next_review_date: ""
  version_compatibility: []
  repository_link: ""  # Link to the code repository
  paper_link: ""  # Link to the research paper
# Design section of the card, split into sub-sections.
# Strings default to "" and lists to [].
evaluation_design:
  motivation:
    scientific_needs: ""
    approach_justification: ""
    expected_benefits: ""
    tradeoffs: ""

  type_and_structure:
    type: ""  # benchmark, challenge, red teaming, deployment study, structured test
    structure: ""
    timeline: ""
    key_design_decisions: []
    # NOTE(review): indentation was lost in this file; design_process is
    # assumed to be a child of type_and_structure -- confirm with the consumer.
    design_process:
      stakeholder_consultation: ""
      pilot_studies: []
      validation_approaches: []

  stakeholders_and_resources:
    target_users: []
    required_expertise: []
    resource_requirements: []
    cost_considerations: ""
# Estimand section of the card. Strings default to "" and lists to [].
estimand:
  target_construct:
    primary_capability: ""
    measurement_type: ""  # representational or pragmatic
    relationship_to_applications: ""
    theoretical_framework: ""

  scope_and_limitations:
    coverage: ""
    excluded_capabilities: []
    known_blind_spots: []
    theoretical_limitations: []

  # NOTE(review): indentation was lost in this file; assessment_components is
  # assumed to be a child of estimand -- confirm with the consumer.
  # One sub-mapping per evaluation type.
  assessment_components:
    test_set:
      data_sources: []
      sampling_methodology: ""
      # Scoped to test_set; a key with the same name also appears in the
      # bias_and_fairness group later in the file.
      known_biases: []
      approach_to_duplicates: ""
      data_quality: ""

    challenge:
      design_principles: []
      task_selection_criteria: []
      difficulty_progression: ""
      # Scoped to challenge; a key with the same name also appears in the
      # technical_framework group later in the file.
      time_constraints: ""

    red_teaming:
      probing_methodology: ""
      coverage_strategy: ""
      adversarial_approach: ""
      safety_considerations: ""

    deployment_study:
      environment_characteristics: ""
      integration_points: []
      success_criteria: []
      monitoring_approach: ""
# Estimator section of the card.
# Strings default to "", lists to [] and mappings to {}.
estimator:
  evaluation_protocol:
    methodology: ""
    control_measures: []
    handling_random_components: ""
    reproducibility_requirements: ""

  metrics:
    primary_metrics: []
    aggregation_methodology: ""
    task_weightings: {}
    performance_bounds: {}
    connection_to_outcomes: ""

  # NOTE(review): indentation was lost in this file; metric_details is assumed
  # to be a sibling of metrics (not a child) -- confirm with the consumer.
  # Sequence of mappings; the single entry below is an empty example item.
  metric_details:
    - name: ""
      definition: ""
      implementation: ""
      edge_cases: []
      statistical_properties: ""
      baseline_values: {}
      failure_modes: []

  technical_framework:
    implementation_requirements: []
    time_constraints: ""
    dependencies: []
    authentication_needs: ""

  constraints_and_rules:
    allowed_resources: []
    permitted_approaches: []
    optimization_constraints: []
    ethical_boundaries: []
# Estimate section of the card: reporting and reproducibility fields.
# Strings default to "" and lists to [].
estimate:
  required_reporting:
    essential_metrics: []
    results_disaggregation: ""
    uncertainty_quantification: ""
    performance_variation: ""
    resource_usage_reporting: ""

  reproducibility_information:
    documentation_requirements: []
    environment_specifications: ""
    randomization_handling: ""
    output_standardization: ""
# Results-communication section: visualization and leaderboard fields.
# Strings default to "" and lists to [].
results_communication:
  visualization:
    recommended_plots: []
    standardized_formats: []
    key_comparisons: []

  leaderboard_guidelines:
    submission_process: ""
    required_metadata: []
# Known issues and limitations, split into three sub-sections.
# Strings default to "" and lists to [].
known_issues_and_limitations:
  validity_concerns:
    construct_validity: ""
    gaming_possibilities: ""
    stability_considerations: ""
    temporal_validity: ""

  practical_limitations:
    resource_constraints: ""
    scalability_issues: ""
    cost_factors: ""
    time_boundaries: ""

  bias_and_fairness:
    # Scoped to bias_and_fairness; a key with the same name also appears in
    # the test_set group earlier in the file.
    known_biases: []
    representation_issues: ""
    potential_impacts: ""
    mitigation_approaches: []
# Versioning and maintenance section. Strings default to "" and lists to [].
version_and_maintenance:
  version_information:
    # Quoted strings keep values such as "1.10" from being read as floats.
    version: ""
    release_date: ""
    change_history: []
    update_plans: ""

  # NOTE(review): indentation was lost in this file; all eight keys below are
  # assumed to belong to maintenance_protocol -- confirm with the consumer.
  maintenance_protocol:
    update_frequency: ""
    deprecation_policy: ""
    issue_reporting: ""
    community_involvement: ""
    criteria_for_updates: []
    breaking_change_policy: ""
    backwards_compatibility: ""
    migration_guides: ""
# Citation and usage section. Strings default to "" and lists to [].
citation_and_usage:
  citation_information:
    recommended_citation: ""
    related_publications: []
    licensing_details: ""

  usage_guidelines:
    recommended_applications: []
    inappropriate_uses: []
    implementation_best_practices: ""
    ethical_considerations: ""
# Free-form closing section of the card.
additional_notes:
  related_evaluations: []
  future_directions: ""