import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
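
# Note: llama-cpp-python loads GGUF checkpoints and runs them on CPU by default;
# the PyPI package name is llama-cpp-python even though the import is llama_cpp.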

# Load a user-specified model
def load_user_model(repo_id, model_file):
    print(f"Downloading model {model_file} from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

# Generate a response using the specified model and prompt
def generate_response(model, prompt):
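    # max_tokens bounds the completion length; temperature=0.5 trades strict
    # determinism for some variety. Adjust to taste.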
    response = model(prompt, max_tokens=512, temperature=0.5)
    return response["choices"][0]["text"]

# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
    if len(evaluation_criteria) > 3:
        # Return a full three-item tuple so every Gradio output gets a value
        return "Error: Please select up to 3 evaluation criteria only.", "", ""

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)

    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)

    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt
    criteria_list = ", ".join(evaluation_criteria)
    evaluation_prompt = f"""
Prompt: {prompt}
Response A: {response_a}
Response B: {response_b}
Evaluation Criteria: {criteria_list}
Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""
    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5
    )
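    # llama-cpp-python returns an OpenAI-style completion dict; the generated
    # text lives under choices[0]["text"].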
    evaluation_results = evaluation_response["choices"][0]["text"]

    # Combine results for display
    final_output = f"""
Evaluation Results:
{evaluation_results}
"""
    return final_output, response_a, response_b
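
# The judge model is loaded once at import time. evaluate_responses resolves the
# lora_model global at call time, so defining it after the function is safe.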
# Load the LoRA evaluation model
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-4100"
    model_file = "unsloth.F16.gguf"
    print(f"Downloading LoRA evaluation model from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"LoRA evaluation model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)

lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 🧐")
    gr.Markdown(
        "Welcome to the LLM-as-a-Judge demo! Enter a prompt, choose two GGUF models from the "
        "Hugging Face Hub, and select up to 3 evaluation criteria. Each model generates a "
        "response, and the LoRA judge model rates both against your criteria and declares a winner."
    )

    # Model inputs
    repo_a_input = gr.Textbox(label="Model A Repository", placeholder="Enter the Hugging Face repo name for Model A...", value="forestav/LoRA-2000")
    model_a_input = gr.Textbox(label="Model A File Name", placeholder="Enter the model filename for Model A...", value="unsloth.F16.gguf")
    repo_b_input = gr.Textbox(label="Model B Repository", placeholder="Enter the Hugging Face repo name for Model B...", value="KolumbusLindh/LoRA-4100")
    model_b_input = gr.Textbox(label="Model B File Name", placeholder="Enter the model filename for Model B...", value="unsloth.F16.gguf")

    # Prompt and criteria inputs
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    criteria_dropdown = gr.CheckboxGroup(
        label="Select Up to 3 Evaluation Criteria",
        choices=["Clarity", "Completeness", "Accuracy", "Relevance", "User-Friendliness", "Depth", "Creativity"]
    )

    # Button and outputs
    evaluate_button = gr.Button("Evaluate Models")

    with gr.Row():
        with gr.Column():
            response_a = gr.Textbox(
                label="Response A",
                placeholder="The response for Model A will appear here...",
                lines=20,
                interactive=False
            )
        with gr.Column():
            response_b = gr.Textbox(
                label="Response B",
                placeholder="The response for Model B will appear here...",
                lines=20,
                interactive=False
            )

    evaluation_output = gr.Textbox(
        label="Evaluation Results",
        placeholder="The evaluation results will appear here...",
        lines=20,
        interactive=False
    )

    # Link evaluation function
    evaluate_button.click(
        fn=evaluate_responses,
        inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown],
        outputs=[evaluation_output, response_a, response_b]
    )
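    # Gradio passes the input components' values to fn positionally and writes
    # the returned tuple to the outputs in order.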

# Launch app
if __name__ == "__main__":
    demo.launch()