import gradio as gr
import time
import json
import pandas as pd
from typing import List, Dict, Any


class BenchmarkSystem:
    def __init__(self):
        self.results = {}

    def run_benchmark(self,
                      model_name: str,
                      test_cases: List[str],
                      system_prompt: str = "") -> Dict[str, Any]:
        """
        Run benchmark tests and measure performance metrics
        """
        results = {
            "model_name": model_name,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_tokens": 0,
            "total_time": 0,
            "responses": [],
            "metrics": {}
        }

        start_time = time.time()

        # Simulate processing test cases
        for test in test_cases:
            # Here you would add actual model inference
            # This is a placeholder for demonstration
            time.sleep(0.5)  # Simulate processing time
            results["responses"].append({
                "input": test,
                "output": f"Sample response for: {test}",
                "tokens": len(test.split()),
                "time": 0.5
            })

        results["total_time"] = time.time() - start_time
        results["total_tokens"] = sum(r["tokens"] for r in results["responses"])

        # Calculate aggregate metrics
        results["metrics"] = {
            "avg_response_time": results["total_time"] / len(test_cases),
            "avg_tokens_per_response": results["total_tokens"] / len(test_cases)
        }

        self.results[model_name] = results
        return results


def format_results(results: Dict[str, Any]) -> str:
    """Format benchmark results for display"""
    output = f"Model: {results['model_name']}\n"
    output += f"Timestamp: {results['timestamp']}\n"
    output += f"Total Time: {results['total_time']:.2f}s\n"
    output += f"Total Tokens: {results['total_tokens']}\n\n"

    output += "Metrics:\n"
    for metric, value in results["metrics"].items():
        output += f"- {metric}: {value:.2f}\n"

    return output


def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
    """Save benchmark results to a file"""
    with open(filename, "w") as f:
        json.dump(results, f, indent=2)
    return f"Results saved to {filename}"


def run_benchmark_interface(model_name: str,
                            test_cases: str,
                            system_prompt: str) -> tuple[str, pd.DataFrame]:
    """
    Gradio interface function for running benchmarks
    """
    benchmark = BenchmarkSystem()

    # Parse test cases (one per line), skipping blank lines
    test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]
    if not test_cases_list:
        raise gr.Error("Please enter at least one test case.")

    # Run benchmark
    results = benchmark.run_benchmark(
        model_name=model_name,
        test_cases=test_cases_list,
        system_prompt=system_prompt
    )

    # Create DataFrame for response details
    df = pd.DataFrame([
        {
            "Input": r["input"],
            "Output": r["output"],
            "Tokens": r["tokens"],
            "Time (s)": r["time"]
        }
        for r in results["responses"]
    ])

    # Save results
    save_results(results)

    return format_results(results), df


# Create Gradio interface
with gr.Blocks(title="Model Benchmark Suite") as demo:
    gr.Markdown("# Model Benchmark Suite")
    gr.Markdown("Test and compare model performance across different scenarios")

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="Enter model name or identifier"
            )
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="Enter system prompt if applicable",
                lines=2
            )
            test_cases = gr.Textbox(
                label="Test Cases",
                placeholder="Enter test cases (one per line)",
                lines=5
            )
            run_button = gr.Button("Run Benchmark")

        with gr.Column():
            results_text = gr.Textbox(
                label="Benchmark Results",
                lines=10,
                interactive=False
            )
            results_table = gr.Dataframe(
                label="Detailed Results",
                headers=["Input", "Output", "Tokens", "Time (s)"]
            )

    run_button.click(
        fn=run_benchmark_interface,
        inputs=[model_name, test_cases, system_prompt],
        outputs=[results_text, results_table]
    )

if __name__ == "__main__":
    demo.launch()
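

# Illustrative sketch (not part of the benchmark above): how the simulated
# inference loop in BenchmarkSystem.run_benchmark could call a real backend.
# This assumes an OpenAI-compatible API and the `openai` client package; the
# function name and the token-counting fallback are hypothetical and shown
# only as an integration example, not as the project's actual inference path.
def openai_inference(model_name: str, system_prompt: str, user_input: str) -> tuple[str, int]:
    """Return (response_text, completion_tokens) for a single test case."""
    from openai import OpenAI  # imported lazily so the demo still runs without this dependency

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_input})

    completion = client.chat.completions.create(model=model_name, messages=messages)
    text = completion.choices[0].message.content or ""
    # Prefer the token count reported by the API; fall back to a rough word count.
    tokens = completion.usage.completion_tokens if completion.usage else len(text.split())
    return text, tokens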