Spaces:

MrSimple07
/

RuSimulBench_arena

Sleeping

App Files Files Community

MrSimple07 committed on Jan 4

Commit

e22fbd5

verified ·

1 Parent(s): 62e328f

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -136

app.py CHANGED Viewed

@@ -1,147 +1,40 @@
 import gradio as gr
-import time
-import json
 import pandas as pd
-from typing import List, Dict, Any
-class BenchmarkSystem:
-    """Placeholder benchmark runner; no model is actually invoked.
-
-    Each call to ``run_benchmark`` fabricates one response per test case and
-    stores the resulting report in ``self.results`` under the model name.
-    """
-    def __init__(self):
-        # Maps model name -> the most recent report built by run_benchmark.
-        self.results = {}
-    def run_benchmark(self,
-                     model_name: str,
-                     test_cases: List[str],
-                     system_prompt: str = "") -> Dict[str, Any]:
-        """
-        Run the simulated benchmark over ``test_cases`` and build a report.
-
-        Args:
-            model_name: Label stored in the report and used as the key in
-                ``self.results``; a later run with the same name overwrites
-                the earlier entry.
-            test_cases: Input strings; each one yields one fabricated response.
-            system_prompt: Accepted but never read by this implementation.
-
-        Returns:
-            Dict with keys ``model_name``, ``timestamp``, ``total_tokens``,
-            ``total_time``, ``responses`` and ``metrics``.
-
-        Raises:
-            ZeroDivisionError: If ``test_cases`` is empty, because the averages
-                below divide by ``len(test_cases)``.
-        """
-        results = {
-            "model_name": model_name,
-            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-            "total_tokens": 0,
-            "total_time": 0,
-            "responses": [],
-            "metrics": {}
-        }
-        start_time = time.time()
-        # Simulate processing test cases
-        for test in test_cases:
-            # Here you would add actual model inference
-            # This is a placeholder for demonstration
-            time.sleep(0.5)  # Simulate processing time
-            results["responses"].append({
-                "input": test,
-                "output": f"Sample response for: {test}",
-                "tokens": len(test.split()),  # whitespace-separated word count of the input
-                "time": 0.5  # hard-coded to mirror the sleep above, not measured
-            })
-        # Wall-clock duration of the whole loop, including the sleeps.
-        results["total_time"] = time.time() - start_time
-        results["total_tokens"] = sum(r["tokens"] for r in results["responses"])
-        # Calculate aggregate metrics
-        # NOTE(review): both divisions fail when test_cases is empty.
-        results["metrics"] = {
-            "avg_response_time": results["total_time"] / len(test_cases),
-            "avg_tokens_per_response": results["total_tokens"] / len(test_cases)
-        }
-        self.results[model_name] = results
-        return results
-def format_results(results: Dict[str, Any]) -> str:
-    """Format benchmark results for display.
-
-    Args:
-        results: Report dict providing ``model_name``, ``timestamp``,
-            ``total_time``, ``total_tokens`` and a ``metrics`` mapping.
-
-    Returns:
-        A multi-line string: four header lines, a blank line, then one
-        ``- name: value`` line per metric.
-    """
-    output = f"Model: {results['model_name']}\n"
-    output += f"Timestamp: {results['timestamp']}\n"
-    output += f"Total Time: {results['total_time']:.2f}s\n"
-    output += f"Total Tokens: {results['total_tokens']}\n\n"
-    output += "Metrics:\n"
-    # The ``:.2f`` format requires every metric value to be numeric.
-    for metric, value in results["metrics"].items():
-        output += f"- {metric}: {value:.2f}\n"
-    return output
-def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
-    """Save benchmark results to a file.
-
-    Args:
-        results: Report dict; it is passed straight to ``json.dump``.
-        filename: Destination path. Opened in ``"w"`` mode, so an existing
-            file is overwritten.
-
-    Returns:
-        A confirmation message naming the file that was written.
-    """
-    # No explicit encoding is given, so the platform default applies.
-    with open(filename, "w") as f:
-        json.dump(results, f, indent=2)
-    return f"Results saved to {filename}"
-def run_benchmark_interface(model_name: str,
-                          test_cases: str,
-                          system_prompt: str) -> tuple[str, pd.DataFrame]:
-    """
-    Gradio interface function for running benchmarks.
-
-    Args:
-        model_name: Label forwarded to ``BenchmarkSystem.run_benchmark``.
-        test_cases: Raw textbox content; split on newlines into test cases.
-        system_prompt: Forwarded to ``run_benchmark`` unchanged.
-
-    Returns:
-        A ``(summary_text, details_table)`` pair: the formatted report and a
-        DataFrame with one row per response.
-
-    NOTE(review): if ``test_cases`` contains no non-blank line, the list
-    passed to ``run_benchmark`` is empty and its average computation raises
-    ZeroDivisionError.
-    """
-    # A fresh BenchmarkSystem per call, so nothing is kept between clicks.
-    benchmark = BenchmarkSystem()
-    # Parse test cases (assuming one per line)
-    # Lines are stripped and blank lines are dropped.
-    test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]
-    # Run benchmark
-    results = benchmark.run_benchmark(
-        model_name=model_name,
-        test_cases=test_cases_list,
-        system_prompt=system_prompt
-    )
-    # Create DataFrame for response details
-    df = pd.DataFrame([
-        {
-            "Input": r["input"],
-            "Output": r["output"],
-            "Tokens": r["tokens"],
-            "Time (s)": r["time"]
-        }
-        for r in results["responses"]
-    ])
-    # Save results
-    # Uses save_results' default filename; the returned message is discarded.
-    save_results(results)
-    return format_results(results), df
-# Create Gradio interface
-# Layout: one row with two columns, inputs on the left and outputs on the right.
-with gr.Blocks(title="Model Benchmark Suite") as demo:
-    gr.Markdown("# Model Benchmark Suite")
-    gr.Markdown("Test and compare model performance across different scenarios")
-    with gr.Row():
-        # Left column: the three text inputs and the trigger button.
-        with gr.Column():
-            model_name = gr.Textbox(
-                label="Model Name",
-                placeholder="Enter model name or identifier"
-            )
-            system_prompt = gr.Textbox(
-                label="System Prompt (Optional)",
-                placeholder="Enter system prompt if applicable",
-                lines=2
-            )
-            test_cases = gr.Textbox(
-                label="Test Cases",
-                placeholder="Enter test cases (one per line)",
-                lines=5
-            )
-            run_button = gr.Button("Run Benchmark")
-        # Right column: the text summary and the per-response table.
-        with gr.Column():
-            # NOTE(review): `readonly` does not appear to be a documented
-            # gr.Textbox parameter; `interactive=False` is the documented way
-            # to make a component read-only. Confirm against the Gradio
-            # version in use.
-            results_text = gr.Textbox(
-                label="Benchmark Results",
-                lines=10,
-                readonly=True
-            )
-            results_table = gr.DataFrame(
-                label="Detailed Results",
-                headers=["Input", "Output", "Tokens", "Time (s)"]
-            )
-    # `inputs` order follows run_benchmark_interface's parameter order
-    # (model_name, test_cases, system_prompt); `outputs` order follows its
-    # returned (text, DataFrame) pair.
-    run_button.click(
-        fn=run_benchmark_interface,
-        inputs=[model_name, test_cases, system_prompt],
-        outputs=[results_text, results_table]
     )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import pandas as pd
+# Static leaderboard data, stored column-wise: position i of every score list
+# belongs to the model at position i of "Model".
+benchmark_data = {
+    "Model": [
+        "IlyaGusev/saiga_llama3_8b",
+        "Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24",
+        "TinyLlama",
+        "Google-gemma-2-27b-it",
+        "mistralai/Mistral-Nemo-Instruct-2407",
+        "Vikhrmodels/Vikhr-Qwen-2.5-0.5b-Instruct",
+    ],
+    "Creativity Score": [78.5, 82.3, 85.7, 83.1, 85.6, 76.5],
+    "Coherence Score": [75.2, 80.1, 84.3, 81.9, 88.5, 76.6],
+    "Diversity Score": [25.3, 27.8, 31.2, 29.5, 88.4, 74.6],
+}
+def display_results():
+    """Return the static benchmark table as a new pandas DataFrame."""
+    return pd.DataFrame(benchmark_data)
+# Build the UI: a read-only results table plus a button that fills it.
+with gr.Blocks() as demo:
+    gr.Markdown("# Model Benchmark Results")
+    # Take the column headers from benchmark_data itself so the empty table
+    # shown before the first click carries the same labels as the DataFrame
+    # that display_results returns; hard-coded labels can drift from the data.
+    output = gr.Dataframe(
+        headers=list(benchmark_data),
+        interactive=False
     )
+    # Clicking the button replaces the table contents with display_results().
+    refresh_btn = gr.Button("Show Results")
+    refresh_btn.click(fn=display_results, outputs=output)
 if __name__ == "__main__":
     demo.launch()