MrSimple07 committed
Commit e22fbd5 · verified · 1 Parent(s): 62e328f

Update app.py

Files changed (1):
  1. app.py +29 -136
app.py CHANGED
@@ -1,147 +1,40 @@
 import gradio as gr
-import time
-import json
 import pandas as pd
-from typing import List, Dict, Any
 
-class BenchmarkSystem:
-    def __init__(self):
-        self.results = {}
-
-    def run_benchmark(self,
-                      model_name: str,
-                      test_cases: List[str],
-                      system_prompt: str = "") -> Dict[str, Any]:
-        """
-        Run benchmark tests and measure performance metrics
-        """
-        results = {
-            "model_name": model_name,
-            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-            "total_tokens": 0,
-            "total_time": 0,
-            "responses": [],
-            "metrics": {}
-        }
-
-        start_time = time.time()
-
-        # Simulate processing test cases
-        for test in test_cases:
-            # Here you would add actual model inference
-            # This is a placeholder for demonstration
-            time.sleep(0.5) # Simulate processing time
-            results["responses"].append({
-                "input": test,
-                "output": f"Sample response for: {test}",
-                "tokens": len(test.split()),
-                "time": 0.5
-            })
-
-        results["total_time"] = time.time() - start_time
-        results["total_tokens"] = sum(r["tokens"] for r in results["responses"])
-
-        # Calculate aggregate metrics
-        results["metrics"] = {
-            "avg_response_time": results["total_time"] / len(test_cases),
-            "avg_tokens_per_response": results["total_tokens"] / len(test_cases)
-        }
-
-        self.results[model_name] = results
-        return results
 
-def format_results(results: Dict[str, Any]) -> str:
-    """Format benchmark results for display"""
-    output = f"Model: {results['model_name']}\n"
-    output += f"Timestamp: {results['timestamp']}\n"
-    output += f"Total Time: {results['total_time']:.2f}s\n"
-    output += f"Total Tokens: {results['total_tokens']}\n\n"
-
-    output += "Metrics:\n"
-    for metric, value in results["metrics"].items():
-        output += f"- {metric}: {value:.2f}\n"
-
-    return output
 
-def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
-    """Save benchmark results to a file"""
-    with open(filename, "w") as f:
-        json.dump(results, f, indent=2)
-    return f"Results saved to {filename}"
 
-def run_benchmark_interface(model_name: str,
-                            test_cases: str,
-                            system_prompt: str) -> tuple[str, pd.DataFrame]:
-    """
-    Gradio interface function for running benchmarks
-    """
-    benchmark = BenchmarkSystem()
-
-    # Parse test cases (assuming one per line)
-    test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]
-
-    # Run benchmark
-    results = benchmark.run_benchmark(
-        model_name=model_name,
-        test_cases=test_cases_list,
-        system_prompt=system_prompt
-    )
-
-    # Create DataFrame for response details
-    df = pd.DataFrame([
-        {
-            "Input": r["input"],
-            "Output": r["output"],
-            "Tokens": r["tokens"],
-            "Time (s)": r["time"]
-        }
-        for r in results["responses"]
-    ])
-
-    # Save results
-    save_results(results)
-
-    return format_results(results), df
 
-# Create Gradio interface
-with gr.Blocks(title="Model Benchmark Suite") as demo:
-    gr.Markdown("# Model Benchmark Suite")
-    gr.Markdown("Test and compare model performance across different scenarios")
-
-    with gr.Row():
-        with gr.Column():
-            model_name = gr.Textbox(
-                label="Model Name",
-                placeholder="Enter model name or identifier"
-            )
-            system_prompt = gr.Textbox(
-                label="System Prompt (Optional)",
-                placeholder="Enter system prompt if applicable",
-                lines=2
-            )
-            test_cases = gr.Textbox(
-                label="Test Cases",
-                placeholder="Enter test cases (one per line)",
-                lines=5
-            )
-            run_button = gr.Button("Run Benchmark")
-
-        with gr.Column():
-            results_text = gr.Textbox(
-                label="Benchmark Results",
-                lines=10,
-                readonly=True
-            )
-            results_table = gr.DataFrame(
-                label="Detailed Results",
-                headers=["Input", "Output", "Tokens", "Time (s)"]
-            )
-
-    run_button.click(
-        fn=run_benchmark_interface,
-        inputs=[model_name, test_cases, system_prompt],
-        outputs=[results_text, results_table]
+#Models:
+# IlyaGusev/saiga_llama3_8b
+# Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24
+# TinyLlama
+# Google-gemma-2-27b-it
+# mistralai/Mistral-Nemo-Instruct-2407
+# Vikhrmodels/Vikhr-Qwen-2.5-0.5b-Instruct
+benchmark_data = {
+    'Model': ['IlyaGusev/saiga_llama3_8b', 'Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24', "TinyLlama", 'Google-gemma-2-27b-it', 'mistralai/Mistral-Nemo-Instruct-2407', 'Vikhrmodels/Vikhr-Qwen-2.5-0.5b-Instruct'],
+    'Creativity Score': [78.5, 82.3, 85.7, 83.1, 85.6, 76.5],
+    'Coherence Score': [75.2, 80.1, 84.3, 81.9, 88.5, 76.6],
+    'Diversity Score': [25.3, 27.8, 31.2, 29.5, 88.4, 74.6]
+}
+def display_results():
+    df = pd.DataFrame(benchmark_data)
+    return df
+
+# Create the interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Model Benchmark Results")
+
+    # Display results in a DataFrame
+    output = gr.Dataframe(
+        headers=["Model", "GLUE Score", "SQuAD F1", "MMLU Score"],
+        interactive=False
     )
+
+    # Button to refresh/display results
+    refresh_btn = gr.Button("Show Results")
+    refresh_btn.click(fn=display_results, outputs=output)
 
 if __name__ == "__main__":
     demo.launch()
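
Note that in the new version the `gr.Dataframe` headers ("GLUE Score", "SQuAD F1", "MMLU Score") do not match the columns actually present in `benchmark_data` ("Creativity Score", "Coherence Score", "Diversity Score"); once the button is clicked, the DataFrame returned by `display_results` supplies its own column names. Below is a minimal sketch of one way to keep the component and the data in sync and show the table on page load. It is not part of the commit, and the two-row `benchmark_data` subset is only illustrative (values copied from the diff above).

import gradio as gr
import pandas as pd

# Illustrative two-row subset of the commit's benchmark_data dict
benchmark_data = {
    'Model': ['IlyaGusev/saiga_llama3_8b', 'Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24'],
    'Creativity Score': [78.5, 82.3],
    'Coherence Score': [75.2, 80.1],
    'Diversity Score': [25.3, 27.8],
}

def display_results():
    # Column names come from the dict keys, so the table always matches the data
    return pd.DataFrame(benchmark_data)

with gr.Blocks() as demo:
    gr.Markdown("# Model Benchmark Results")
    # Pre-populate the table so it is visible without clicking the button
    output = gr.Dataframe(value=display_results(), interactive=False)
    refresh_btn = gr.Button("Show Results")
    refresh_btn.click(fn=display_results, outputs=output)

if __name__ == "__main__":
    demo.launch()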