import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
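
# Note: llama-cpp-python loads GGUF checkpoints and runs them on CPU by default;
# the PyPI package name is llama-cpp-python even though the import is llama_cpp.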

# Load a user-specified model
def load_user_model(repo_id, model_file):
    print(f"Downloading model {model_file} from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

# Generate a response using the specified model and prompt
def generate_response(model, prompt):
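    # max_tokens bounds the completion length; temperature=0.5 trades strict
    # determinism for some variety. Adjust to taste.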
    response = model(prompt, max_tokens=512, temperature=0.5)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
    if len(evaluation_criteria) > 3:
        # Return a full three-item tuple so every Gradio output gets a value
        return "Error: Please select up to 3 evaluation criteria only.", "", ""

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)

    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)

    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt
    criteria_list = ", ".join(evaluation_criteria)
    evaluation_prompt = f"""
Prompt: {prompt}
Response A: {response_a}
Response B: {response_b}
Evaluation Criteria: {criteria_list}
Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""
    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
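    # llama-cpp-python returns an OpenAI-style completion dict; the generated
    # text lives under choices[0]["text"].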
    evaluation_results = evaluation_response["choices"][0]["text"]

    # Combine results for display
    final_output = f"""
Evaluation Results:
{evaluation_results}
"""
    return final_output, response_a, response_b
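
# The judge model is loaded once at import time. evaluate_responses resolves the
# lora_model global at call time, so defining it after the function is safe.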
# Load the LoRA evaluation model
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    print(f"Downloading LoRA evaluation model from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"LoRA evaluation model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")
    gr.Markdown(
        "Welcome to the LLM-as-a-Judge demo! Enter a prompt, choose two GGUF models from the "
        "Hugging Face Hub, and select up to 3 evaluation criteria. Each model generates a "
        "response, and the LoRA judge model rates both against your criteria and declares a winner."
    )

    # Model inputs
    repo_a_input = gr.Textbox(label="Model A Repository", placeholder="Enter the Hugging Face repo name for Model A...", value="forestav/LoRA-2000")
    model_a_input = gr.Textbox(label="Model A File Name", placeholder="Enter the model filename for Model A...", value="unsloth.F16.gguf")
    repo_b_input = gr.Textbox(label="Model B Repository", placeholder="Enter the Hugging Face repo name for Model B...", value="KolumbusLindh/LoRA-4100")
    model_b_input = gr.Textbox(label="Model B File Name", placeholder="Enter the model filename for Model B...", value="unsloth.F16.gguf")

    # Prompt and criteria inputs
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    criteria_dropdown = gr.CheckboxGroup(
        label="Select Up to 3 Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"]
    )

    # Button and outputs
    evaluate_button = gr.Button("Evaluate Models")

    with gr.Row():
        with gr.Column():
            response_a = gr.Textbox(
                label="Response A",
                placeholder="The response for Model A will appear here...",
                lines=20,
                interactive=False
            )
        with gr.Column():
            response_b = gr.Textbox(
                label="Response B",
                placeholder="The response for Model B will appear here...",
                lines=20,
                interactive=False
            )

    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=20,
        interactive=False
    )

    # Link evaluation function
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown],
        outputs=[evaluation_output, response_a, response_b]
    )
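    # Gradio passes the input components' values to fn positionally and writes
    # the returned tuple to the outputs in order.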

# Launch app
if __name__ == "__main__":
    demo.launch()