# LLMsOnTrial / app.py
import os
import streamlit as st
import random
from typing import Tuple, Dict
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain.chat_models import init_chat_model
from atla import Atla
from dotenv import load_dotenv
load_dotenv()
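# API keys are read from the environment (optionally via a .env file picked up
# by load_dotenv, or from Streamlit secrets in initialize_api_keys below).
# A minimal .env sketch using the same variable names this file expects; the
# placeholder values are illustrative only:
#
#   OPENAI_API_KEY=...
#   ANTHROPIC_API_KEY=...
#   TOGETHER_API_KEY=...
#   ATLA_API_KEY=...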
# Set page config
st.set_page_config(page_title="Meta-ChatGPT", layout="wide")
# Configuration parameters
QUALITY_THRESHOLD = 4.0 # Threshold for acceptable response quality
MAX_ITERATIONS = 3 # Maximum number of refinement iterations
EVAL_PROMPT = """
Evaluate the response on the following dimensions, scoring each from 1-5 (where 5 is excellent):
1. Accuracy: Is the response factually correct and free from hallucination or misinformation?
2. Relevance: Does the response directly answer the user's question effectively?
3. Clarity: Is the response clearly structured and easily understandable?
4. Depth: Does the response provide sufficient detail, insight, or useful context?
For each dimension, provide:
- A numeric score (1-5)
- A brief explanation justifying the score
- Specific suggestions for improvement
Then provide an overall average score and a concise summary of your evaluation.
Your overall average score should be a single floating-point number between 1 and 5.
"""
# Initialize API keys from environment variables or Streamlit secrets
def initialize_api_keys():
# Check if we're running in Streamlit Cloud with secrets
try:
if hasattr(st, "secrets") and "OPENAI_API_KEY" in st.secrets:
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
os.environ["ANTHROPIC_API_KEY"] = st.secrets["ANTHROPIC_API_KEY"]
os.environ["TOGETHER_API_KEY"] = st.secrets["TOGETHER_API_KEY"]
os.environ["ATLA_API_KEY"] = st.secrets["ATLA_API_KEY"]
# Keys should be loaded from environment variables or .env file
# No UI for API key input needed
except Exception as e:
st.sidebar.error(f"Error loading API keys: {e}")
# Initialize models and session state
def initialize_app():
initialize_api_keys()
# Initialize LLM clients if they don't exist or if API keys have been updated
if "initialized" not in st.session_state:
try:
st.session_state.gpt4o = init_chat_model("gpt-4o", model_provider="openai")
st.session_state.claude = init_chat_model(
"claude-3-7-sonnet-20250219", model_provider="anthropic"
)
st.session_state.deepseek = init_chat_model(
"deepseek-ai/DeepSeek-V3", model_provider="together"
)
st.session_state.atla = Atla()
st.session_state.initialized = True
# Initialize chat messages
if "chat_messages" not in st.session_state:
st.session_state.chat_messages = [
SystemMessage(
content="You are a helpful assistant that can answer questions and help with tasks."
)
]
# Initialize chat history for display
if "chat_history" not in st.session_state:
st.session_state.chat_history = []
# Initialize latest result
if "latest_result" not in st.session_state:
st.session_state.latest_result = None
except Exception as e:
st.error(f"Error initializing models: {e}")
st.warning("Please check your API keys in the sidebar.")
st.session_state.initialized = False
def evaluate_with_atla(inputs: Dict[str, str]) -> Tuple[float, str]:
"""Evaluate response using Atla's Selene model."""
response = st.session_state.atla.evaluation.create(
model_id="atla-selene",
model_input=inputs["question"],
model_output=inputs["response"],
evaluation_criteria=EVAL_PROMPT,
)
evaluation = response.result.evaluation
return float(evaluation.score), evaluation.critique
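# A hedged usage sketch of evaluate_with_atla (the question/response strings
# below are made up for illustration; the app itself calls it via
# evaluate_response):
#
#   score, critique = evaluate_with_atla({
#       "question": "What is the capital of France?",
#       "response": "The capital of France is Paris.",
#   })
#   # score is a float between 1 and 5 (per EVAL_PROMPT); critique is Selene's
#   # written feedback.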
def get_responses(
question: str, feedback: str = "", with_status: bool = True
) -> Dict[str, str]:
"""Get responses from all LLMs for a given question."""
st.session_state.chat_messages.append(HumanMessage(content=question))
if feedback:
st.session_state.chat_messages.append(HumanMessage(content=feedback))
responses = {}
if with_status:
# Create progress trackers for each model
with st.status(
"Generating responses from all models...", expanded=True
) as status:
# Get response from GPT-4o
status.update(label="Getting response from GPT-4o...")
gpt_response = st.session_state.gpt4o.invoke(st.session_state.chat_messages)
responses["GPT-4o"] = gpt_response.content
# Get response from Claude
status.update(label="Getting response from Claude 3.7...")
claude_response = st.session_state.claude.invoke(
st.session_state.chat_messages
)
responses["Claude 3.7"] = claude_response.content
# Get response from DeepSeek
status.update(label="Getting response from DeepSeekV3.0...")
deepseek_response = st.session_state.deepseek.invoke(
st.session_state.chat_messages
)
responses["DeepSeekV3.0"] = deepseek_response.content
status.update(label="All responses generated successfully!", state="complete")
else:
# Get responses without status bar (for refinement)
st.write("Getting response from models...")
# Get response from GPT-4o
gpt_response = st.session_state.gpt4o.invoke(st.session_state.chat_messages)
responses["GPT-4o"] = gpt_response.content
# Get response from Claude
claude_response = st.session_state.claude.invoke(st.session_state.chat_messages)
responses["Claude 3.7"] = claude_response.content
# Get response from DeepSeek
deepseek_response = st.session_state.deepseek.invoke(
st.session_state.chat_messages
)
responses["DeepSeekV3.0"] = deepseek_response.content
return responses
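# Shape of the dict get_responses returns (keys match the model labels used
# throughout this file; values are the full response texts, shortened here):
#
#   {
#       "GPT-4o": "...",
#       "Claude 3.7": "...",
#       "DeepSeekV3.0": "...",
#   }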
def evaluate_response(question: str, response: str) -> Dict:
"""Evaluate a single response using Selene."""
inputs = {"question": question, "response": response}
score, critique = evaluate_with_atla(inputs)
return {"score": score, "critique": critique}
def evaluate_all_responses(
question: str, responses: Dict[str, str], use_status: bool = True
) -> Dict[str, Dict]:
"""Evaluate all responses and return their evaluations."""
evaluations = {}
if (
use_status and len(st.session_state.chat_history) <= 1
): # Only use status on initial response
with st.status("Evaluating responses with Selene...", expanded=True) as status:
for model_name, response in responses.items():
status.update(label=f"Evaluating {model_name} response...")
evaluation = evaluate_response(question, response)
evaluations[model_name] = evaluation
status.update(label="All evaluations complete!", state="complete")
else:
# Simple version without status
st.write("Evaluating responses with Selene...")
for model_name, response in responses.items():
evaluation = evaluate_response(question, response)
evaluations[model_name] = evaluation
st.write("All evaluations complete!")
return evaluations
def select_best_response(evaluations: Dict[str, Dict]) -> Tuple[str, Dict]:
"""Select the best response based on overall score. Randomly choose if tied."""
best_score = -1
tied_models = []
for model_name, evaluation in evaluations.items():
overall_score = evaluation["score"]
if overall_score > best_score:
# New highest score - clear previous ties and start fresh
best_score = overall_score
tied_models = [(model_name, evaluation)]
elif overall_score == best_score:
# Tie detected - add to the list of tied models
tied_models.append((model_name, evaluation))
    # Randomly select among the models sharing the top score (a single model if there is no tie)
if tied_models:
best_model, best_evaluation = random.choice(tied_models)
return best_model, best_evaluation
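# Illustrative example of the tie-break in select_best_response (scores are
# made up): given
#
#   {"GPT-4o": {"score": 4.5, "critique": "..."},
#    "Claude 3.7": {"score": 4.5, "critique": "..."},
#    "DeepSeekV3.0": {"score": 3.0, "critique": "..."}}
#
# the function picks GPT-4o or Claude 3.7 at random, since both share the top
# score.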
def refine_responses(question: str, model: str, evaluation: Dict) -> Tuple[str, Dict]:
"""Refine a response based on Selene's critique."""
critique = evaluation["critique"]
feedback = f"Please improve your previous response based on this feedback: {critique}"
# Display refining message
st.write(f"Refining response with {model}...")
# Get improved responses without status bar (to avoid nesting)
improved_responses = get_responses(question, feedback, with_status=False)
improved_response = improved_responses[model]
# Re-evaluate the improved response
st.write("Re-evaluating refined response...")
new_evaluation = evaluate_response(question, improved_response)
st.write("Refinement complete!")
return improved_response, new_evaluation
def meta_chat(question: str) -> Dict:
"""Process user question through the Meta-ChatGPT system."""
iteration = 0
refinement_history = []
# Step 1: Get initial responses from all models
responses = get_responses(question)
# Step 2: Evaluate all responses
# Use status only for the first message
evaluations = evaluate_all_responses(
question, responses, use_status=len(st.session_state.chat_history) <= 1
)
# Step 3: Select best response
best_model, best_evaluation = select_best_response(evaluations)
best_response = responses[best_model]
st.session_state.chat_messages.append(AIMessage(content=best_response))
best_score = best_evaluation["score"]
# Record initial state
refinement_history.append(
{
"iteration": iteration,
"model": best_model,
"response": best_response,
"evaluation": best_evaluation,
"score": best_score,
}
)
# Step 4: Iterative refinement if score is below threshold
while best_score < QUALITY_THRESHOLD and iteration < MAX_ITERATIONS:
iteration += 1
st.info(
f"Response quality ({best_score:.2f}/5) below threshold ({QUALITY_THRESHOLD}/5). Refining..."
)
# Refine the best response based on feedback
improved_response, new_evaluation = refine_responses(
question, best_model, best_evaluation
)
new_score = new_evaluation["score"]
# Update best response if improved
if new_score > best_score:
best_response = improved_response
best_evaluation = new_evaluation
best_score = new_score
# Update the AI message in chat_messages
st.session_state.chat_messages[-1] = AIMessage(content=best_response)
# Record refinement state
refinement_history.append(
{
"iteration": iteration,
"model": best_model,
"response": improved_response,
"evaluation": new_evaluation,
"score": new_score,
}
)
# Step 5: Return final result
result = {
"question": question,
"best_model": best_model,
"best_response": best_response,
"best_score": best_score,
"iterations_required": iteration,
"all_evaluations": evaluations,
"refinement_history": refinement_history,
"threshold_met": best_score >= QUALITY_THRESHOLD,
"all_initial_responses": responses,
}
return result
def display_chat():
"""Display the chat interface and history."""
# Display chat history
for entry in st.session_state.chat_history:
if entry["role"] == "user":
with st.chat_message("user"):
st.markdown(entry["content"])
else:
# Use just "assistant" for avatar to avoid errors
with st.chat_message("assistant"):
st.markdown(entry["content"])
# Add a footnote with model and score info
st.caption(f"{entry['model']} (Score: {entry['score']:.2f}/5)")
def display_evaluation_details():
"""Display detailed evaluation information."""
if st.session_state.latest_result:
result = st.session_state.latest_result
# Display best model and score
st.subheader(f"Best Model: {result['best_model']}")
st.metric("Overall Score", f"{result['best_score']:.2f}/5")
# Refinement information
if result["iterations_required"] > 0:
st.subheader("Refinement Process")
st.write(
f"Required {result['iterations_required']} refinements to reach quality threshold."
)
# Create tabs for each refinement iteration
tabs = st.tabs(
["Initial"]
+ [f"Refinement {i+1}" for i in range(result["iterations_required"])]
)
for i, tab in enumerate(tabs):
if i < len(result["refinement_history"]):
refinement = result["refinement_history"][i]
with tab:
st.metric("Score", f"{refinement['score']:.2f}/5")
st.write("**Response:**")
st.text_area(
"Response Text",
value=refinement["response"],
height=150,
key=f"refinement_response_{i}",
disabled=True,
)
st.write("**Atla Critique:**")
st.write(refinement["evaluation"]["critique"])
# Model comparison
st.subheader("Model Comparison")
for model, eval_data in result["all_evaluations"].items():
with st.expander(f"{model}: {eval_data['score']:.2f}/5"):
st.write("**Initial Response:**")
st.text_area(
"Response",
value=result["all_initial_responses"][model],
height=150,
key=f"response_{model}",
disabled=True,
)
st.write("**Atla Critique:**")
st.write(eval_data["critique"])
def main():
"""Main app function"""
# Initialize the app
initialize_app()
# Initialize session state for sidebar visibility if not exists
if "show_analysis" not in st.session_state:
st.session_state.show_analysis = False
# Main content takes full width when analysis is collapsed
if st.session_state.get("latest_result") and st.session_state.show_analysis:
col1, col2 = st.columns([2, 1])
else:
# Use full width for main content when analysis is collapsed
col1 = st.container()
col2 = None # We won't use col2 when analysis is collapsed
with col1:
# Display header
st.title("πŸ€– Meta-ChatGPT with Selene")
st.markdown(
"""
This app uses multiple LLMs (GPT-4o, Claude 3.7, and DeepSeekV3.0) to answer your questions.
Selene evaluates each response, and the best one is selected and refined if needed.
"""
)
# Add toggle for analysis panel if we have results
if st.session_state.get("latest_result"):
toggle_col1, toggle_col2 = st.columns([4, 1])
with toggle_col2:
if st.button(
"πŸ“Š "
+ (
"Hide Analysis"
if st.session_state.show_analysis
else "Show Analysis"
)
):
st.session_state.show_analysis = not st.session_state.show_analysis
st.rerun()
# Display chat interface
display_chat()
# Check if API keys are configured
if not st.session_state.get("initialized", False):
st.warning("Please configure your API keys in the sidebar to continue.")
return
# Chat input
user_input = st.chat_input("Ask a question...")
# Use a separate column for evaluation details
if (
st.session_state.get("latest_result")
and st.session_state.show_analysis
and col2 is not None
):
with col2:
st.title("Response Analysis")
display_evaluation_details()
if user_input:
# Display user message
with st.chat_message("user"):
st.markdown(user_input)
# Add to history
st.session_state.chat_history.append({"role": "user", "content": user_input})
# Get meta chat response
with st.spinner("Processing your question..."):
result = meta_chat(user_input)
# Store latest result for sidebar display
st.session_state.latest_result = result
# Auto-expand the analysis panel when a new response comes in
st.session_state.show_analysis = True
# Display assistant message
with st.chat_message("assistant"):
st.markdown(result["best_response"])
st.caption(f"{result['best_model']} (Score: {result['best_score']:.2f}/5)")
# Add to history
st.session_state.chat_history.append(
{
"role": "assistant",
"content": result["best_response"],
"model": result["best_model"],
"score": result["best_score"],
}
)
# Force a refresh to update the evaluation details
st.rerun()
if __name__ == "__main__":
main()