import os
import random
from typing import Dict, Tuple

import streamlit as st
from atla import Atla
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

load_dotenv()

# Set page config
st.set_page_config(page_title="Meta-ChatGPT", layout="wide")

# Configuration parameters
QUALITY_THRESHOLD = 4.0  # Minimum acceptable response quality (out of 5)
MAX_ITERATIONS = 3  # Maximum number of refinement iterations

EVAL_PROMPT = """
Evaluate the response on the following dimensions, scoring each from 1-5 (where 5 is excellent):

1. Accuracy: Is the response factually correct and free from hallucination or misinformation?
2. Relevance: Does the response directly answer the user's question effectively?
3. Clarity: Is the response clearly structured and easily understandable?
4. Depth: Does the response provide sufficient detail, insight, or useful context?

For each dimension, provide:
- A numeric score (1-5)
- A brief explanation justifying the score
- Specific suggestions for improvement

Then provide an overall average score and a concise summary of your evaluation.
Your overall average score should be a single floating-point number between 1 and 5.
"""


def initialize_api_keys():
    """Load API keys from Streamlit secrets into environment variables.

    Keys may also come from environment variables or a .env file, in which
    case nothing needs to happen here; no UI for API key input is needed.
    """
    try:
        if hasattr(st, "secrets"):
            for key in (
                "OPENAI_API_KEY",
                "ANTHROPIC_API_KEY",
                "TOGETHER_API_KEY",
                "ATLA_API_KEY",
            ):
                # Only copy keys that are actually present, so one missing
                # secret doesn't raise a KeyError and mask the others.
                if key in st.secrets:
                    os.environ[key] = st.secrets[key]
    except Exception as e:
        st.sidebar.error(f"Error loading API keys: {e}")


def initialize_app():
    """Initialize models and session state."""
    initialize_api_keys()

    # Initialize LLM clients once per session
    if "initialized" not in st.session_state:
        try:
            st.session_state.gpt4o = init_chat_model("gpt-4o", model_provider="openai")
            st.session_state.claude = init_chat_model(
                "claude-3-7-sonnet-20250219", model_provider="anthropic"
            )
            st.session_state.deepseek = init_chat_model(
                "deepseek-ai/DeepSeek-V3", model_provider="together"
            )
            st.session_state.atla = Atla()
            st.session_state.initialized = True

            # Initialize the chat messages shared by all models
            if "chat_messages" not in st.session_state:
                st.session_state.chat_messages = [
                    SystemMessage(
                        content="You are a helpful assistant that can answer questions and help with tasks."
                    )
                ]

            # Initialize chat history for display
            if "chat_history" not in st.session_state:
                st.session_state.chat_history = []

            # Initialize latest result
            if "latest_result" not in st.session_state:
                st.session_state.latest_result = None
        except Exception as e:
            st.error(f"Error initializing models: {e}")
            st.warning("Please check your API keys in the sidebar.")
            st.session_state.initialized = False

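# A minimal sketch of the question -> responses -> evaluation pipeline
# implemented by the functions below (values are hypothetical, shown for
# shape only; the real calls live in the function bodies that follow):
#
#   responses = get_responses("What causes tides?")
#   # -> {"GPT-4o": "...", "Claude 3.7": "...", "DeepSeekV3.0": "..."}
#   score, critique = evaluate_with_atla(
#       {"question": "What causes tides?", "response": responses["GPT-4o"]}
#   )
#   # score    -> e.g. 4.25 (a float between 1 and 5, per EVAL_PROMPT)
#   # critique -> free-text feedback later used to refine the response
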
def evaluate_with_atla(inputs: Dict[str, str]) -> Tuple[float, str]:
    """Evaluate a response using Atla's Selene model."""
    response = st.session_state.atla.evaluation.create(
        model_id="atla-selene",
        model_input=inputs["question"],
        model_output=inputs["response"],
        evaluation_criteria=EVAL_PROMPT,
    )
    evaluation = response.result.evaluation
    return float(evaluation.score), evaluation.critique


def get_responses(
    question: str, feedback: str = "", with_status: bool = True
) -> Dict[str, str]:
    """Get responses from all LLMs for a given question."""
    st.session_state.chat_messages.append(HumanMessage(content=question))
    if feedback:
        # During refinement the question is re-sent together with Selene's
        # critique so the models see what to improve.
        st.session_state.chat_messages.append(HumanMessage(content=feedback))

    models = {
        "GPT-4o": st.session_state.gpt4o,
        "Claude 3.7": st.session_state.claude,
        "DeepSeekV3.0": st.session_state.deepseek,
    }
    responses = {}

    if with_status:
        # Show per-model progress in a status widget
        with st.status(
            "Generating responses from all models...", expanded=True
        ) as status:
            for model_name, model in models.items():
                status.update(label=f"Getting response from {model_name}...")
                responses[model_name] = model.invoke(
                    st.session_state.chat_messages
                ).content
            status.update(
                label="All responses generated successfully!", state="complete"
            )
    else:
        # Get responses without a status widget (used during refinement,
        # to avoid nesting status widgets)
        st.write("Getting response from models...")
        for model_name, model in models.items():
            responses[model_name] = model.invoke(st.session_state.chat_messages).content

    return responses


def evaluate_response(question: str, response: str) -> Dict:
    """Evaluate a single response using Selene."""
    inputs = {"question": question, "response": response}
    score, critique = evaluate_with_atla(inputs)
    return {"score": score, "critique": critique}


def evaluate_all_responses(
    question: str, responses: Dict[str, str], use_status: bool = True
) -> Dict[str, Dict]:
    """Evaluate all responses and return their evaluations."""
    evaluations = {}

    if (
        use_status and len(st.session_state.chat_history) <= 1
    ):  # Only use a status widget on the initial response
        with st.status("Evaluating responses with Selene...", expanded=True) as status:
            for model_name, response in responses.items():
                status.update(label=f"Evaluating {model_name} response...")
                evaluations[model_name] = evaluate_response(question, response)
            status.update(label="All evaluations complete!", state="complete")
    else:
        # Simple version without a status widget
        st.write("Evaluating responses with Selene...")
        for model_name, response in responses.items():
            evaluations[model_name] = evaluate_response(question, response)
        st.write("All evaluations complete!")

    return evaluations

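# Shape of the evaluations mapping that select_best_response consumes
# (hypothetical scores, for illustration only):
#
#   {
#       "GPT-4o":       {"score": 4.5,  "critique": "..."},
#       "Claude 3.7":   {"score": 4.5,  "critique": "..."},
#       "DeepSeekV3.0": {"score": 3.75, "critique": "..."},
#   }
#
# With the tie above, select_best_response would pick GPT-4o or Claude 3.7
# at random.
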
def select_best_response(evaluations: Dict[str, Dict]) -> Tuple[str, Dict]:
    """Select the best response based on overall score. Randomly choose if tied."""
    best_score = -1
    tied_models = []

    for model_name, evaluation in evaluations.items():
        overall_score = evaluation["score"]
        if overall_score > best_score:
            # New highest score - clear previous ties and start fresh
            best_score = overall_score
            tied_models = [(model_name, evaluation)]
        elif overall_score == best_score:
            # Tie detected - add to the list of tied models
            tied_models.append((model_name, evaluation))

    if not tied_models:
        raise ValueError("select_best_response called with no evaluations.")

    # If multiple models are tied for the highest score, pick one at random
    best_model, best_evaluation = random.choice(tied_models)
    return best_model, best_evaluation


def refine_response(question: str, model: str, evaluation: Dict) -> Tuple[str, Dict]:
    """Refine a response based on Selene's critique."""
    critique = evaluation["critique"]
    feedback = f"Please improve your previous response based on this feedback: {critique}"

    # Display refining message
    st.write(f"Refining response with {model}...")

    # Get improved responses without a status widget (to avoid nesting)
    improved_responses = get_responses(question, feedback, with_status=False)
    improved_response = improved_responses[model]

    # Re-evaluate the improved response
    st.write("Re-evaluating refined response...")
    new_evaluation = evaluate_response(question, improved_response)
    st.write("Refinement complete!")

    return improved_response, new_evaluation


def meta_chat(question: str) -> Dict:
    """Process a user question through the Meta-ChatGPT system."""
    iteration = 0
    refinement_history = []

    # Step 1: Get initial responses from all models
    responses = get_responses(question)

    # Step 2: Evaluate all responses (use a status widget only for the first message)
    evaluations = evaluate_all_responses(
        question, responses, use_status=len(st.session_state.chat_history) <= 1
    )

    # Step 3: Select the best response
    best_model, best_evaluation = select_best_response(evaluations)
    best_response = responses[best_model]
    st.session_state.chat_messages.append(AIMessage(content=best_response))
    best_score = best_evaluation["score"]

    # Record the initial state
    refinement_history.append(
        {
            "iteration": iteration,
            "model": best_model,
            "response": best_response,
            "evaluation": best_evaluation,
            "score": best_score,
        }
    )

    # Step 4: Iteratively refine while the score is below threshold
    while best_score < QUALITY_THRESHOLD and iteration < MAX_ITERATIONS:
        iteration += 1
        st.info(
            f"Response quality ({best_score:.2f}/5) below threshold ({QUALITY_THRESHOLD}/5). Refining..."
        )

        # Refine the best response based on feedback
        improved_response, new_evaluation = refine_response(
            question, best_model, best_evaluation
        )
        new_score = new_evaluation["score"]

        # Keep the refined response only if it actually improved
        if new_score > best_score:
            best_response = improved_response
            best_evaluation = new_evaluation
            best_score = new_score
            # Update the AI message in chat_messages
            st.session_state.chat_messages[-1] = AIMessage(content=best_response)

        # Record the refinement state
        refinement_history.append(
            {
                "iteration": iteration,
                "model": best_model,
                "response": improved_response,
                "evaluation": new_evaluation,
                "score": new_score,
            }
        )

    # Step 5: Return the final result
    result = {
        "question": question,
        "best_model": best_model,
        "best_response": best_response,
        "best_score": best_score,
        "iterations_required": iteration,
        "all_evaluations": evaluations,
        "refinement_history": refinement_history,
        "threshold_met": best_score >= QUALITY_THRESHOLD,
        "all_initial_responses": responses,
    }

    return result

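# Keys of the dict returned by meta_chat (values here are illustrative):
#
#   {
#       "question": "...",
#       "best_model": "Claude 3.7",
#       "best_response": "...",
#       "best_score": 4.25,
#       "iterations_required": 1,      # 0 if the threshold was met immediately
#       "all_evaluations": {...},      # per-model score/critique
#       "refinement_history": [...],   # one entry per iteration, incl. the initial pass
#       "threshold_met": True,
#       "all_initial_responses": {...},
#   }
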
def display_chat():
    """Display the chat interface and history."""
    for entry in st.session_state.chat_history:
        if entry["role"] == "user":
            with st.chat_message("user"):
                st.markdown(entry["content"])
        else:
            # Use just "assistant" for the avatar to avoid errors
            with st.chat_message("assistant"):
                st.markdown(entry["content"])
                # Add a footnote with model and score info
                st.caption(f"{entry['model']} (Score: {entry['score']:.2f}/5)")


def display_evaluation_details():
    """Display detailed evaluation information."""
    if st.session_state.latest_result:
        result = st.session_state.latest_result

        # Display best model and score
        st.subheader(f"Best Model: {result['best_model']}")
        st.metric("Overall Score", f"{result['best_score']:.2f}/5")

        # Refinement information
        if result["iterations_required"] > 0:
            st.subheader("Refinement Process")
            st.write(
                f"Required {result['iterations_required']} refinements to reach quality threshold."
            )

            # Create tabs for each refinement iteration
            tabs = st.tabs(
                ["Initial"]
                + [f"Refinement {i+1}" for i in range(result["iterations_required"])]
            )

            for i, tab in enumerate(tabs):
                if i < len(result["refinement_history"]):
                    refinement = result["refinement_history"][i]
                    with tab:
                        st.metric("Score", f"{refinement['score']:.2f}/5")
                        st.write("**Response:**")
                        st.text_area(
                            "Response Text",
                            value=refinement["response"],
                            height=150,
                            key=f"refinement_response_{i}",
                            disabled=True,
                        )
                        st.write("**Atla Critique:**")
                        st.write(refinement["evaluation"]["critique"])

        # Model comparison
        st.subheader("Model Comparison")
        for model, eval_data in result["all_evaluations"].items():
            with st.expander(f"{model}: {eval_data['score']:.2f}/5"):
                st.write("**Initial Response:**")
                st.text_area(
                    "Response",
                    value=result["all_initial_responses"][model],
                    height=150,
                    key=f"response_{model}",
                    disabled=True,
                )
                st.write("**Atla Critique:**")
                st.write(eval_data["critique"])

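# display_chat above renders st.session_state.chat_history entries, which
# main() below appends in two shapes:
#
#   {"role": "user", "content": "..."}
#   {"role": "assistant", "content": "...", "model": "GPT-4o", "score": 4.5}
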
def main():
    """Main app function."""
    # Initialize the app
    initialize_app()

    # Initialize session state for analysis-panel visibility if not set
    if "show_analysis" not in st.session_state:
        st.session_state.show_analysis = False

    # Main content takes the full width when the analysis panel is collapsed
    if st.session_state.get("latest_result") and st.session_state.show_analysis:
        col1, col2 = st.columns([2, 1])
    else:
        col1 = st.container()
        col2 = None  # col2 is unused when the analysis panel is collapsed

    with col1:
        # Display header
        st.title("🤖 Meta-ChatGPT with Selene")
        st.markdown(
            """
            This app uses multiple LLMs (GPT-4o, Claude 3.7, and DeepSeekV3.0) to answer your questions.
            Selene evaluates each response, and the best one is selected and refined if needed.
            """
        )

        # Add a toggle for the analysis panel once we have results
        if st.session_state.get("latest_result"):
            toggle_col1, toggle_col2 = st.columns([4, 1])
            with toggle_col2:
                if st.button(
                    "📊 "
                    + (
                        "Hide Analysis"
                        if st.session_state.show_analysis
                        else "Show Analysis"
                    )
                ):
                    st.session_state.show_analysis = not st.session_state.show_analysis
                    st.rerun()

        # Display the chat interface
        display_chat()

        # Check that API keys are configured
        if not st.session_state.get("initialized", False):
            st.warning("Please configure your API keys in the sidebar to continue.")
            return

        # Chat input
        user_input = st.chat_input("Ask a question...")

    # Use a separate column for evaluation details
    if (
        st.session_state.get("latest_result")
        and st.session_state.show_analysis
        and col2 is not None
    ):
        with col2:
            st.title("Response Analysis")
            display_evaluation_details()

    if user_input:
        # Display the user message
        with st.chat_message("user"):
            st.markdown(user_input)

        # Add it to the history
        st.session_state.chat_history.append({"role": "user", "content": user_input})

        # Get the meta-chat response
        with st.spinner("Processing your question..."):
            result = meta_chat(user_input)

        # Store the latest result for the analysis panel
        st.session_state.latest_result = result
        # Auto-expand the analysis panel when a new response comes in
        st.session_state.show_analysis = True

        # Display the assistant message
        with st.chat_message("assistant"):
            st.markdown(result["best_response"])
            st.caption(f"{result['best_model']} (Score: {result['best_score']:.2f}/5)")

        # Add it to the history
        st.session_state.chat_history.append(
            {
                "role": "assistant",
                "content": result["best_response"],
                "model": result["best_model"],
                "score": result["best_score"],
            }
        )

        # Force a refresh to update the evaluation details
        st.rerun()


if __name__ == "__main__":
    main()
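
# To run the app locally (assuming this file is saved as app.py and the
# OPENAI/ANTHROPIC/TOGETHER/ATLA API keys are available via the environment,
# a .env file, or Streamlit secrets):
#
#   streamlit run app.py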