File size: 17,999 Bytes
f0da249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
import os
import streamlit as st
import random
from typing import Tuple, Dict
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain.chat_models import init_chat_model
from atla import Atla
from dotenv import load_dotenv

# Load settings from a local .env file (when one exists) so the API keys are
# available through the process environment before the app initializes.
load_dotenv()

# Set page config
st.set_page_config(page_title="Meta-ChatGPT", layout="wide")

# Configuration parameters
# meta_chat() keeps refining the selected answer while its score is below
# QUALITY_THRESHOLD, for at most MAX_ITERATIONS refinement rounds.
QUALITY_THRESHOLD = 4.0  # Threshold for acceptable response quality
MAX_ITERATIONS = 3  # Maximum number of refinement iterations
# Rubric sent verbatim as `evaluation_criteria` on every Atla evaluation call
# (see evaluate_with_atla). The text is runtime data: changing it changes how
# responses are scored.
EVAL_PROMPT = """
    Evaluate the response on the following dimensions, scoring each from 1-5 (where 5 is excellent):

    1. Accuracy: Is the response factually correct and free from hallucination or misinformation?
    2. Relevance: Does the response directly answer the user's question effectively?
    3. Clarity: Is the response clearly structured and easily understandable?
    4. Depth: Does the response provide sufficient detail, insight, or useful context?

    For each dimension, provide:
    - A numeric score (1-5)
    - A brief explanation justifying the score
    - Specific suggestions for improvement

    Then provide an overall average score and a concise summary of your evaluation.
    Your overall average score should be a single floating-point number between 1 and 5.
"""


# Initialize API keys from environment variables or Streamlit secrets
def initialize_api_keys():
    """Copy provider API keys from Streamlit secrets into ``os.environ``.

    Each key is handled independently: a key that is present in
    ``st.secrets`` is exported, a key that is absent is left to whatever the
    environment (or the ``.env`` file loaded at import time) already
    provides. Previously all four keys were read as soon as
    ``OPENAI_API_KEY`` was present, so a single missing secret aborted the
    whole copy with a ``KeyError``.

    Any error raised while reading the secrets is reported in the sidebar
    instead of being propagated, so the app can still start from
    environment variables alone.
    """
    key_names = (
        "OPENAI_API_KEY",
        "ANTHROPIC_API_KEY",
        "TOGETHER_API_KEY",
        "ATLA_API_KEY",
    )
    try:
        if not hasattr(st, "secrets"):
            return
        for key_name in key_names:
            # Only override the environment for keys the secrets store has.
            if key_name in st.secrets:
                os.environ[key_name] = st.secrets[key_name]
        # Keys should otherwise be loaded from environment variables or .env file
        # No UI for API key input needed
    except Exception as e:
        st.sidebar.error(f"Error loading API keys: {e}")


# Initialize models and session state
def initialize_app():
    """Prepare session state and create the model/evaluator clients.

    Conversation state (``chat_messages``, ``chat_history``,
    ``latest_result``) is created first and unconditionally, because the UI
    renders the chat history even when client creation fails; creating it
    only after a successful model setup left those keys missing and made
    the page crash on the failure path.

    Client creation runs until it has succeeded once. A failed attempt
    records ``initialized = False`` and is retried on the next script run,
    so fixing the API keys does not require a new browser session.
    """
    initialize_api_keys()

    # Initialize chat messages (the message list sent to the models)
    if "chat_messages" not in st.session_state:
        st.session_state.chat_messages = [
            SystemMessage(
                content="You are a helpful assistant that can answer questions and help with tasks."
            )
        ]

    # Initialize chat history for display
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Initialize latest result
    if "latest_result" not in st.session_state:
        st.session_state.latest_result = None

    # Nothing more to do once the clients exist.
    if st.session_state.get("initialized", False):
        return

    try:
        st.session_state.gpt4o = init_chat_model("gpt-4o", model_provider="openai")
        st.session_state.claude = init_chat_model(
            "claude-3-7-sonnet-20250219", model_provider="anthropic"
        )
        st.session_state.deepseek = init_chat_model(
            "deepseek-ai/DeepSeek-V3", model_provider="together"
        )
        st.session_state.atla = Atla()
        st.session_state.initialized = True
    except Exception as e:
        st.error(f"Error initializing models: {e}")
        st.warning("Please check your API keys in the sidebar.")
        st.session_state.initialized = False


def evaluate_with_atla(inputs: dict[str, str]) -> Tuple[float, str]:
    """Score a question/response pair with the ``atla-selene`` evaluator.

    Args:
        inputs: Mapping with a ``"question"`` entry (sent as the model input)
            and a ``"response"`` entry (sent as the model output).

    Returns:
        A ``(score, critique)`` pair: the evaluator's score converted to
        ``float`` and its critique as returned by the client.
    """
    atla_client = st.session_state.atla
    verdict = atla_client.evaluation.create(
        model_id="atla-selene",
        model_input=inputs["question"],
        model_output=inputs["response"],
        evaluation_criteria=EVAL_PROMPT,
    ).result.evaluation
    numeric_score = float(verdict.score)
    return numeric_score, verdict.critique


def get_responses(
    question: str, feedback: str = "", with_status: bool = True
) -> Dict[str, str]:
    """Ask every configured model for an answer to the current conversation.

    The question (and the feedback, when non-empty) is appended to
    ``st.session_state.chat_messages`` as human messages before the models
    are invoked, so each model sees the full message list.

    Args:
        question: The user's question.
        feedback: Optional extra instruction appended after the question.
        with_status: When true, progress is shown in an ``st.status`` box
            with a per-model label; otherwise a single plain line is written.

    Returns:
        Mapping from model display name to that model's response content,
        in the order GPT-4o, Claude 3.7, DeepSeekV3.0.
    """
    history = st.session_state.chat_messages
    history.append(HumanMessage(content=question))
    if feedback:
        history.append(HumanMessage(content=feedback))

    # (display name, session_state attribute holding the client), in query order
    model_slots = (
        ("GPT-4o", "gpt4o"),
        ("Claude 3.7", "claude"),
        ("DeepSeekV3.0", "deepseek"),
    )
    answers = {}

    def _ask(slot):
        # The client is looked up right before use, one model at a time.
        reply = getattr(st.session_state, slot).invoke(history)
        return reply.content

    if not with_status:
        # Plain output (used during refinement)
        st.write("Getting response from models...")
        for display_name, slot in model_slots:
            answers[display_name] = _ask(slot)
        return answers

    with st.status(
        "Generating responses from all models...", expanded=True
    ) as status:
        for display_name, slot in model_slots:
            status.update(label=f"Getting response from {display_name}...")
            answers[display_name] = _ask(slot)

        status.update(label="All responses generated successfully!", state="complete")

    return answers


def evaluate_response(question: str, response: str) -> Dict:
    """Evaluate one response and package the verdict as a dictionary.

    Returns:
        ``{"score": <float>, "critique": <critique>}`` as produced by
        ``evaluate_with_atla`` for this question/response pair.
    """
    score, critique = evaluate_with_atla(
        {"question": question, "response": response}
    )
    return dict(score=score, critique=critique)


def evaluate_all_responses(
    question: str, responses: Dict[str, str], use_status: bool = True
) -> Dict[str, Dict]:
    """Evaluate every model's response to the question.

    Args:
        question: The user's question.
        responses: Mapping from model name to response text.
        use_status: Request an ``st.status`` progress box. It is only
            honoured while the displayed chat history has at most one entry.

    Returns:
        Mapping from model name to its evaluation dictionary, in the same
        order as ``responses``.
    """
    # Only use status on initial response
    show_status = use_status and len(st.session_state.chat_history) <= 1

    if not show_status:
        # Simple version without status
        st.write("Evaluating responses with Selene...")
        results = {
            name: evaluate_response(question, text)
            for name, text in responses.items()
        }
        st.write("All evaluations complete!")
        return results

    results = {}
    with st.status("Evaluating responses with Selene...", expanded=True) as status:
        for name, text in responses.items():
            status.update(label=f"Evaluating {name} response...")
            results[name] = evaluate_response(question, text)

        status.update(label="All evaluations complete!", state="complete")

    return results


def select_best_response(evaluations: Dict[str, Dict]) -> Tuple[str, Dict]:
    """Select the best response based on overall score. Randomly choose if tied.

    Args:
        evaluations: Mapping from model name to an evaluation dictionary
            that contains a numeric ``"score"`` entry.

    Returns:
        ``(model_name, evaluation)`` for the highest-scoring model. When
        several models share the highest score, one of them is picked
        uniformly at random.

    Raises:
        ValueError: If ``evaluations`` is empty. (The previous implementation
            failed with ``UnboundLocalError`` in that case, and also whenever
            no score exceeded its ``-1`` sentinel.)
    """
    if not evaluations:
        raise ValueError("Cannot select a best response: no evaluations provided.")

    top_score = max(evaluation["score"] for evaluation in evaluations.values())
    # Keep every model that reached the top score so ties are broken fairly.
    candidates = [
        (model_name, evaluation)
        for model_name, evaluation in evaluations.items()
        if evaluation["score"] == top_score
    ]
    return random.choice(candidates)


def refine_responses(question: str, model: str, evaluation: Dict) -> Tuple[str, Dict]:
    """Ask for an improved answer based on the critique and re-score it.

    Args:
        question: The user's question.
        model: Display name of the model whose refined answer is kept.
        evaluation: Evaluation dictionary whose ``"critique"`` entry is
            turned into the improvement request.

    Returns:
        ``(improved_response, new_evaluation)`` for the chosen model.
    """
    feedback = (
        f"Please improve your previous response based on this feedback: {evaluation['critique']}"
    )

    # Display refining message
    st.write(f"Refining response with {model}...")

    # No status box here (to avoid nesting); keep only the chosen model's answer.
    refined_text = get_responses(question, feedback, with_status=False)[model]

    # Re-evaluate the improved response
    st.write("Re-evaluating refined response...")
    refined_evaluation = evaluate_response(question, refined_text)

    st.write("Refinement complete!")

    return refined_text, refined_evaluation


def meta_chat(question: str) -> Dict:
    """Process user question through the Meta-ChatGPT system.

    Pipeline: collect an answer from every model, evaluate each answer,
    keep the highest-scoring one, then refine it while its score is below
    ``QUALITY_THRESHOLD`` (at most ``MAX_ITERATIONS`` rounds). A refined
    answer replaces the current best only when it scores strictly higher.

    Conversation bookkeeping: the selected answer is stored in
    ``st.session_state.chat_messages`` right after the user's question.
    Each refinement round appends extra prompt messages (the repeated
    question and the feedback request) through ``get_responses``; those are
    removed again after the round so the stored conversation stays
    "question, best answer". The answer is replaced by index, because after
    a refinement round the last message is the feedback prompt, not the
    answer.

    Args:
        question: The user's question.

    Returns:
        Dictionary with the keys ``question``, ``best_model``,
        ``best_response``, ``best_score``, ``iterations_required``,
        ``all_evaluations``, ``refinement_history``, ``threshold_met`` and
        ``all_initial_responses``.
    """
    iteration = 0
    refinement_history = []

    # Step 1: Get initial responses from all models
    responses = get_responses(question)

    # Step 2: Evaluate all responses
    # Use status only for the first message
    evaluations = evaluate_all_responses(
        question, responses, use_status=len(st.session_state.chat_history) <= 1
    )

    # Step 3: Select best response
    best_model, best_evaluation = select_best_response(evaluations)
    best_response = responses[best_model]
    st.session_state.chat_messages.append(AIMessage(content=best_response))
    # Position of the assistant answer for this turn; used to update it later.
    answer_index = len(st.session_state.chat_messages) - 1
    best_score = best_evaluation["score"]

    # Record initial state
    refinement_history.append(
        {
            "iteration": iteration,
            "model": best_model,
            "response": best_response,
            "evaluation": best_evaluation,
            "score": best_score,
        }
    )

    # Step 4: Iterative refinement if score is below threshold
    while best_score < QUALITY_THRESHOLD and iteration < MAX_ITERATIONS:
        iteration += 1
        st.info(
            f"Response quality ({best_score:.2f}/5) below threshold ({QUALITY_THRESHOLD}/5). Refining..."
        )

        # Refine the best response based on feedback
        try:
            improved_response, new_evaluation = refine_responses(
                question, best_model, best_evaluation
            )
        finally:
            # Drop the refinement prompts appended after the answer so they
            # neither accumulate across rounds nor leak into the next turn.
            del st.session_state.chat_messages[answer_index + 1 :]
        new_score = new_evaluation["score"]

        # Update best response if improved
        if new_score > best_score:
            best_response = improved_response
            best_evaluation = new_evaluation
            best_score = new_score
            # Update the AI message in chat_messages
            st.session_state.chat_messages[answer_index] = AIMessage(
                content=best_response
            )

        # Record refinement state
        refinement_history.append(
            {
                "iteration": iteration,
                "model": best_model,
                "response": improved_response,
                "evaluation": new_evaluation,
                "score": new_score,
            }
        )

    # Step 5: Return final result
    result = {
        "question": question,
        "best_model": best_model,
        "best_response": best_response,
        "best_score": best_score,
        "iterations_required": iteration,
        "all_evaluations": evaluations,
        "refinement_history": refinement_history,
        "threshold_met": best_score >= QUALITY_THRESHOLD,
        "all_initial_responses": responses,
    }

    return result


def display_chat():
    """Render every entry of ``st.session_state.chat_history`` as a chat bubble.

    User entries show only their content. All other entries are drawn as
    assistant messages, followed by a caption with the model name and score.
    """
    for entry in st.session_state.chat_history:
        from_user = entry["role"] == "user"
        # Use just "assistant" for non-user avatars to avoid errors
        with st.chat_message("user" if from_user else "assistant"):
            st.markdown(entry["content"])
            if not from_user:
                # Footnote with model and score info
                st.caption(f"{entry['model']} (Score: {entry['score']:.2f}/5)")


def display_evaluation_details():
    """Render the analysis of ``st.session_state.latest_result``.

    Shows the winning model and its score, one tab per refinement step
    (when any refinement took place), and an expander per model with its
    initial response and critique. Renders nothing when there is no result.
    """
    result = st.session_state.latest_result
    if not result:
        return

    # Best model and score
    st.subheader(f"Best Model: {result['best_model']}")
    st.metric("Overall Score", f"{result['best_score']:.2f}/5")

    # Refinement information
    refinement_count = result["iterations_required"]
    if refinement_count > 0:
        st.subheader("Refinement Process")
        st.write(
            f"Required {refinement_count} refinements to reach quality threshold."
        )

        # One tab for the initial answer plus one per refinement round
        tab_titles = ["Initial"]
        tab_titles.extend(
            f"Refinement {round_no}" for round_no in range(1, refinement_count + 1)
        )
        tabs = st.tabs(tab_titles)

        # zip() stops at the shorter sequence, so tabs without a recorded
        # step stay empty.
        for i, (tab, step) in enumerate(zip(tabs, result["refinement_history"])):
            with tab:
                st.metric("Score", f"{step['score']:.2f}/5")

                st.write("**Response:**")
                st.text_area(
                    "Response Text",
                    value=step["response"],
                    height=150,
                    key=f"refinement_response_{i}",
                    disabled=True,
                )

                st.write("**Atla Critique:**")
                st.write(step["evaluation"]["critique"])

    # Model comparison
    st.subheader("Model Comparison")
    initial_responses = result["all_initial_responses"]
    for model, eval_data in result["all_evaluations"].items():
        with st.expander(f"{model}: {eval_data['score']:.2f}/5"):
            st.write("**Initial Response:**")
            st.text_area(
                "Response",
                value=initial_responses[model],
                height=150,
                key=f"response_{model}",
                disabled=True,
            )

            st.write("**Atla Critique:**")
            st.write(eval_data["critique"])


def main():
    """Main app function.

    Lays out the page (chat column plus an optional analysis column),
    renders the stored conversation, and, when the user submits a question,
    runs it through ``meta_chat``, stores the result in session state and
    reruns the script so the analysis panel reflects the new result.

    The header and button emoji are written as Unicode escapes; the
    literals in the file had been corrupted into mojibake.
    """
    # Initialize the app
    initialize_app()

    # Initialize session state for sidebar visibility if not exists
    if "show_analysis" not in st.session_state:
        st.session_state.show_analysis = False

    # Main content takes full width when analysis is collapsed
    if st.session_state.get("latest_result") and st.session_state.show_analysis:
        col1, col2 = st.columns([2, 1])
    else:
        # Use full width for main content when analysis is collapsed
        col1 = st.container()
        col2 = None  # We won't use col2 when analysis is collapsed

    with col1:
        # Display header (U+1F916 is the robot-face emoji)
        st.title("\U0001F916 Meta-ChatGPT with Selene")
        st.markdown(
            """
        This app uses multiple LLMs (GPT-4o, Claude 3.7, and DeepSeekV3.0) to answer your questions.
        Selene evaluates each response, and the best one is selected and refined if needed.
        """
        )

        # Add toggle for analysis panel if we have results
        if st.session_state.get("latest_result"):
            # The wide first column only pushes the button to the right.
            _spacer_col, toggle_col2 = st.columns([4, 1])
            with toggle_col2:
                # U+1F4CA is the bar-chart emoji
                if st.button(
                    "\U0001F4CA "
                    + (
                        "Hide Analysis"
                        if st.session_state.show_analysis
                        else "Show Analysis"
                    )
                ):
                    st.session_state.show_analysis = not st.session_state.show_analysis
                    st.rerun()

        # Display chat interface
        display_chat()

        # Check if API keys are configured
        if not st.session_state.get("initialized", False):
            st.warning("Please configure your API keys in the sidebar to continue.")
            return

        # Chat input
        user_input = st.chat_input("Ask a question...")

    # Use a separate column for evaluation details
    if (
        st.session_state.get("latest_result")
        and st.session_state.show_analysis
        and col2 is not None
    ):
        with col2:
            st.title("Response Analysis")
            display_evaluation_details()

    if user_input:
        # Display user message
        with st.chat_message("user"):
            st.markdown(user_input)

        # Add to history
        st.session_state.chat_history.append({"role": "user", "content": user_input})

        # Get meta chat response
        with st.spinner("Processing your question..."):
            result = meta_chat(user_input)

        # Store latest result for sidebar display
        st.session_state.latest_result = result

        # Auto-expand the analysis panel when a new response comes in
        st.session_state.show_analysis = True

        # Display assistant message
        with st.chat_message("assistant"):
            st.markdown(result["best_response"])
            st.caption(f"{result['best_model']} (Score: {result['best_score']:.2f}/5)")

        # Add to history
        st.session_state.chat_history.append(
            {
                "role": "assistant",
                "content": result["best_response"],
                "model": result["best_model"],
                "score": result["best_score"],
            }
        )

        # Force a refresh to update the evaluation details
        st.rerun()


# Entry point: build and run the app only when this file is executed as the
# main script, not when it is imported as a module.
if __name__ == "__main__":
    main()