Update app.py
app.py (CHANGED)
@@ -309,94 +309,108 @@ def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
 
 initialize_leaderboard_file()
 
-with gr.Blocks() as demo:
+
+# Function to set default mode
+css_dark_mode = """
+body {
+    background-color: #1e1e1e;
+    color: #ffffff;
+}
+
+a {
+    color: #4caf50;
+}
+
+a:hover {
+    color: #81c784;
+    text-decoration: underline;
+}
+
+button {
+    background-color: #4caf50;
+    color: #ffffff;
+    border-radius: 5px;
+    padding: 10px;
+}
+
+button:hover {
+    background-color: #81c784;
+}
+
+.input-row, .tab-content {
+    background-color: #333333;
+    border-radius: 8px;
+    padding: 15px;
+}
+
+.dataframe {
+    color: #ffffff;
+    background-color: #2e2e2e;
+    border: 1px solid #4caf50;
+}
+"""
+
+with gr.Blocks(css=css_dark_mode) as demo:
     gr.Markdown("""
-    # Competition Title
-    ### Welcome to the Competition Overview
+    # 🏆 **Competition Title**
+    ### 🌟 **Welcome to the Competition Overview**
     [image]
-
+    ---
+    Welcome to the **Mobile-MMLU Benchmark Competition**. Here you can submit your predictions, view the leaderboard, and track your performance!
+    ---
     """)
 
     with gr.Tabs():
         with gr.TabItem("📖 Overview"):
             gr.Markdown("""
-            ## Overview
-
-
-            Evaluate the performance of mobile-compatible Large Language Models (LLMs) on 16,186 scenario-based and factual questions across 80 fields. Compete to showcase your model’s accuracy for real-world mobile scenarios.
-
-            ## What is Mobile-MMLU?
-
-            Mobile-MMLU is a benchmark designed to test the capabilities of LLMs optimized for mobile use. By participating in this competition, you contribute to advancing mobile intelligence benchmarks and shaping the future of mobile-compatible AI systems.
-
+            ## 📋 Overview
+            Welcome to the **Mobile-MMLU Benchmark Competition**! Evaluate mobile-compatible Large Language Models (LLMs) on **16,186 scenario-based and factual questions** across **80 fields**.
             ---
-
-
-
-
-
-
-            2. **Generate Predictions**
-               Use your LLM to answer the questions
-
-
-
-
-
-
-
-            5. **Leaderboard**
-               Compare your results against other participants on the live leaderboard.
+            ### 🔍 **What is Mobile-MMLU?**
+            Mobile-MMLU is a benchmark designed to test the capabilities of LLMs optimized for mobile use. Contribute to advancing mobile AI systems by competing to achieve the highest accuracy.
+
+            ### 🛠 **How It Works**
+            1. **Download the Dataset**
+               Access the dataset and instructions on our [GitHub page](https://github.com/your-github-repo).
+            2. **Generate Predictions**
+               Use your LLM to answer the dataset questions. Format your predictions as a CSV file.
+            3. **Submit Predictions**
+               Upload your predictions on this platform.
+            4. **Evaluation**
+               Submissions are scored on accuracy.
+            5. **Leaderboard**
+               View real-time rankings on the leaderboard.
 
             ---
-
-
-
-
-            - **Accuracy**: Correctly answering questions across diverse fields.
+            ### 🎯 **Competition Tasks**
+            Participants must:
+            - Optimize their models for **accuracy**.
+            - Answer diverse field questions effectively.
             ---
+            ### 🚀 **Get Started**
+            1. Prepare your model using resources on our [GitHub page](https://github.com/your-github-repo).
+            2. Submit predictions in the required format.
+            3. Track your progress on the leaderboard.
 
-
-
-
-            1. **Prepare Your Model**
-               Refer to our [GitHub page](https://github.com/your-github-repo) for dataset access and response generation instructions.
-
-            2. **Submit Predictions**
-               Format your submission as specified in the rules.
-
-            3. **Track Progress**
-               Check the leaderboard for real-time rankings.
-
+            ### 📧 **Contact Us**
+            For support, email: [Insert Email Address]
             ---
-
-            ## Contact Us
-
-            For questions or support, contact us at: [Insert Email Address]
-            """)
+            """)
 
         with gr.TabItem("🤖 Submission"):
             with gr.Row():
-                file_input = gr.File(label="Upload Prediction CSV", file_types=[".csv"], interactive=True)
-                model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
+                file_input = gr.File(label="📂 Upload Prediction CSV", file_types=[".csv"], interactive=True)
+                model_name_input = gr.Textbox(label="🏷️ Model Name", placeholder="Enter your model name")
 
             with gr.Row():
-                overall_accuracy_display = gr.Number(label="Overall Accuracy", interactive=False)
-                add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)
+                overall_accuracy_display = gr.Number(label="🏅 Overall Accuracy", interactive=False)
+                add_to_leaderboard_checkbox = gr.Checkbox(label="📈 Add to Leaderboard?", value=True)
 
-            eval_button = gr.Button("Evaluate")
-            eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
-
-            def handle_evaluation(file, model_name, add_to_leaderboard):
-                status, leaderboard = evaluate_predictions(file, model_name, add_to_leaderboard)
-                if leaderboard.empty:
-                    overall_accuracy = 0
-                else:
-                    overall_accuracy = leaderboard.iloc[-1]["Overall Accuracy"]
-                return status, overall_accuracy
+            eval_button = gr.Button("Evaluate", elem_id="evaluate-button")
+            eval_status = gr.Textbox(label="📒 Evaluation Status", interactive=False)
 
             eval_button.click(
-                handle_evaluation,
+                evaluate_predictions,
                 inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
                 outputs=[eval_status, overall_accuracy_display],
             )
@@ -404,18 +418,17 @@ For questions or support, contact us at: [Insert Email Address]
         with gr.TabItem("🏅 Leaderboard"):
             leaderboard_table = gr.Dataframe(
                 value=load_leaderboard(),
-                label="Leaderboard",
+                label="📊 Leaderboard",
                 interactive=False,
                 wrap=True,
             )
-            refresh_button = gr.Button("Refresh Leaderboard")
+            refresh_button = gr.Button("🔄 Refresh Leaderboard")
             refresh_button.click(
                 lambda: load_leaderboard(),
                 inputs=[],
                 outputs=[leaderboard_table],
             )
 
-    gr.Markdown(f"Last updated: {LAST_UPDATED}")
+    gr.Markdown(f"**📅 Last updated:** {LAST_UPDATED}")
 
 demo.launch()
-
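A note on the second hunk of changes: the old code wrapped `evaluate_predictions` in a local `handle_evaluation` that unpacked a `(status, leaderboard)` pair and pulled the last row's `"Overall Accuracy"` out of the DataFrame. The new wiring passes `evaluate_predictions` straight to `eval_button.click`, so the function itself must now return one value per declared output. A minimal sketch of that contract, with placeholder logic only (the real scoring lives elsewhere in app.py):

```python
def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
    # Gradio maps the returned tuple positionally onto
    # outputs=[eval_status, overall_accuracy_display].
    # Placeholder logic; the real function scores the uploaded CSV
    # and optionally appends a row to the leaderboard file.
    status = f"Evaluation complete for {model_name}"
    overall_accuracy = 0.0  # placeholder; computed from the predictions in the real app
    return status, overall_accuracy
```

If the function still returned the full leaderboard DataFrame as its second value, the `gr.Number` display would receive a DataFrame, so the direct wiring only works once the return shape matches the outputs.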
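The first hunk injects the dark theme by passing a raw CSS string to `gr.Blocks(css=...)`, and the new `elem_id="evaluate-button"` gives the Evaluate button a stable DOM id that such a stylesheet can target. The committed CSS only styles generic tags (`body`, `a`, `button`); the id selector in this sketch is an illustrative assumption, not part of the commit:

```python
import gradio as gr

# Sketch: style one component via its elem_id instead of all <button> tags.
# The #evaluate-button rule is hypothetical; the commit's CSS uses tag selectors only.
css = """
#evaluate-button {
    background-color: #4caf50;
    font-weight: bold;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Button("Evaluate", elem_id="evaluate-button")

demo.launch()
```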
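For participants, the Overview text asks for predictions as a CSV upload, but the diff does not pin down the column layout. The schema below (a `question_id` column plus a `prediction` column) is therefore a hypothetical illustration; the authoritative format is in the competition's GitHub instructions:

```python
import pandas as pd

# Hypothetical prediction file; the real column names and answer format
# are specified in the competition's GitHub instructions.
predictions = pd.DataFrame(
    {
        "question_id": ["q1", "q2", "q3"],  # assumed identifier column
        "prediction": ["A", "C", "B"],      # assumed answer column
    }
)
predictions.to_csv("predictions.csv", index=False)  # file to upload in the Submission tab
```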
|