# Page residue from the hosting site (not part of the template): "Spaces: Sleeping"
# Evaluation Card Template
# Placeholders are left as empty strings, lists, or maps to be filled in.
title: "[Evaluation Name]"
# Folded block scalar: the indented text below is the summary value.
summary: >
  Brief description of the evaluation approach, its purpose, and scope.
# Card metadata: people, dates, compatibility, and reference links.
metadata:
  authors: []
  maintainers: []
  # Dates are kept as quoted strings so a parser does not retype them.
  creation_date: ""
  last_review_date: ""
  next_review_date: ""
  version_compatibility: []
  repository_link: ""  # Link to the code repository
  paper_link: ""  # Link to the research paper
# Evaluation design: motivation, type/structure, design process, and
# stakeholders/resources.
# NOTE(review): nesting was reconstructed from a flattened copy. In particular,
# design_process and stakeholders_and_resources are placed as siblings of
# type_and_structure -- confirm against the canonical template.
evaluation_design:
  motivation:
    scientific_needs: ""
    approach_justification: ""
    expected_benefits: ""
    tradeoffs: ""
  type_and_structure:
    # benchmark, challenge, red teaming, deployment study, structured test
    type: ""
    structure: ""
    timeline: ""
    key_design_decisions: []
  design_process:
    stakeholder_consultation: ""
    pilot_studies: []
    validation_approaches: []
  stakeholders_and_resources:
    target_users: []
    required_expertise: []
    resource_requirements: []
    cost_considerations: ""
# Estimand: the target construct, its scope and limitations, and the
# assessment components.
# NOTE(review): nesting was reconstructed from a flattened copy -- confirm
# against the canonical template.
estimand:
  target_construct:
    primary_capability: ""
    measurement_type: ""  # representational or pragmatic
    relationship_to_applications: ""
    theoretical_framework: ""
  scope_and_limitations:
    coverage: ""
    excluded_capabilities: []
    known_blind_spots: []
    theoretical_limitations: []
  # One sub-mapping per component kind; each has its own set of fields.
  assessment_components:
    test_set:
      data_sources: []
      sampling_methodology: ""
      known_biases: []
      approach_to_duplicates: ""
      data_quality: ""
    challenge:
      design_principles: []
      task_selection_criteria: []
      difficulty_progression: ""
      time_constraints: ""
    red_teaming:
      probing_methodology: ""
      coverage_strategy: ""
      adversarial_approach: ""
      safety_considerations: ""
    deployment_study:
      environment_characteristics: ""
      integration_points: []
      success_criteria: []
      monitoring_approach: ""
# Estimator: evaluation protocol, metrics, per-metric details, technical
# framework, and constraints/rules.
# NOTE(review): nesting was reconstructed from a flattened copy. metric_details
# is placed as a sibling of metrics (not a child) -- confirm against the
# canonical template.
estimator:
  evaluation_protocol:
    methodology: ""
    control_measures: []
    handling_random_components: ""
    reproducibility_requirements: ""
  metrics:
    primary_metrics: []
    aggregation_methodology: ""
    task_weightings: {}
    performance_bounds: {}
    connection_to_outcomes: ""
  # Sequence of mappings; the single entry below is a blank example item.
  metric_details:
    - name: ""
      definition: ""
      implementation: ""
      edge_cases: []
      statistical_properties: ""
      baseline_values: {}
      failure_modes: []
  technical_framework:
    implementation_requirements: []
    time_constraints: ""
    dependencies: []
    authentication_needs: ""
  constraints_and_rules:
    allowed_resources: []
    permitted_approaches: []
    optimization_constraints: []
    ethical_boundaries: []
# Estimate: required reporting fields and reproducibility information.
# NOTE(review): nesting was reconstructed from a flattened copy -- confirm
# against the canonical template.
estimate:
  required_reporting:
    essential_metrics: []
    results_disaggregation: ""
    uncertainty_quantification: ""
    performance_variation: ""
    resource_usage_reporting: ""
  reproducibility_information:
    documentation_requirements: []
    environment_specifications: ""
    randomization_handling: ""
    output_standardization: ""
# Results communication: visualization and leaderboard guidelines.
# NOTE(review): nesting was reconstructed from a flattened copy -- confirm
# against the canonical template.
results_communication:
  visualization:
    recommended_plots: []
    standardized_formats: []
    key_comparisons: []
  leaderboard_guidelines:
    submission_process: ""
    required_metadata: []
# Known issues and limitations: validity concerns, practical limitations, and
# bias/fairness.
# NOTE(review): nesting was reconstructed from a flattened copy -- confirm
# against the canonical template.
known_issues_and_limitations:
  validity_concerns:
    construct_validity: ""
    gaming_possibilities: ""
    stability_considerations: ""
    temporal_validity: ""
  practical_limitations:
    resource_constraints: ""
    scalability_issues: ""
    cost_factors: ""
    time_boundaries: ""
  bias_and_fairness:
    known_biases: []
    representation_issues: ""
    potential_impacts: ""
    mitigation_approaches: []
# Version and maintenance: version information and the maintenance protocol.
# NOTE(review): nesting was reconstructed from a flattened copy. The last four
# keys (criteria_for_updates .. migration_guides) are assumed to belong to
# maintenance_protocol -- confirm against the canonical template.
version_and_maintenance:
  version_information:
    # Quoted so a version such as "1.10" is not parsed as a float.
    version: ""
    release_date: ""
    change_history: []
    update_plans: ""
  maintenance_protocol:
    update_frequency: ""
    deprecation_policy: ""
    issue_reporting: ""
    community_involvement: ""
    criteria_for_updates: []
    breaking_change_policy: ""
    backwards_compatibility: ""
    migration_guides: ""
# Citation and usage: citation information and usage guidelines.
# NOTE(review): nesting was reconstructed from a flattened copy.
# ethical_considerations is assumed to belong to usage_guidelines -- confirm
# against the canonical template.
citation_and_usage:
  citation_information:
    recommended_citation: ""
    related_publications: []
    licensing_details: ""
  usage_guidelines:
    recommended_applications: []
    inappropriate_uses: []
    implementation_best_practices: ""
    ethical_considerations: ""
# Additional notes: related evaluations and future directions.
additional_notes:
  related_evaluations: []
  future_directions: ""