kaikaidai committed
Commit fb2bb34 · verified · 1 Parent(s): 2aa1365

Synced repo using 'sync_with_huggingface' Github Action

Files changed (1)
  1. app.py +154 -59
app.py CHANGED
@@ -4,10 +4,11 @@ import random
  from typing import Tuple, Dict
  from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
  from langchain.chat_models import init_chat_model
- from atla import Atla
+ from atla import Atla, AsyncAtla
  from dotenv import load_dotenv
+ import asyncio

- load_dotenv()
+ load_dotenv(dotenv_path="/.env")

  # Set page config
  st.set_page_config(page_title="Meta-GPT", layout="wide")
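A note on the changed load_dotenv() call above: dotenv_path="/.env" is an absolute path at the filesystem root, so the variables are only picked up if a file really lives at /.env (as it may in the Space's container), whereas a bare load_dotenv() typically looks for a .env next to the script or working directory. Whichever path is used, the file is expected to define the four keys that initialize_api_keys() checks further down in this diff; an illustrative .env with placeholder values (not part of the commit) would be:

    OPENAI_API_KEY=...
    ANTHROPIC_API_KEY=...
    TOGETHER_API_KEY=...
    ATLA_API_KEY=...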
@@ -15,45 +16,112 @@ st.set_page_config(page_title="Meta-GPT", layout="wide")
  # Configuration parameters
  QUALITY_THRESHOLD = 4.0 # Threshold for acceptable response quality
  MAX_ITERATIONS = 3 # Maximum number of refinement iterations
- EVAL_PROMPT = """
- Evaluate the response on the following dimensions, scoring each from 1-5 (where 5 is excellent):

- 1. Accuracy: Is the response factually correct and free from hallucination or misinformation?
- 2. Relevance: Does the response directly answer the user's question effectively?
- 3. Clarity: Is the response clearly structured and easily understandable?
- 4. Depth: Does the response provide sufficient detail, insight, or useful context?
+ # Split the evaluation prompt into separate dimensions
+ ACCURACY_PROMPT = """
+ Evaluate the response on Accuracy: Is the response factually correct and free from hallucination or misinformation?
+
+ Scoring Rubric:
+ Score 1: The response contains numerous factual errors or completely fabricated information.
+ Score 2: The response contains major factual errors or significant hallucinations.
+ Score 3: The response contains some factual inaccuracies, but they are not significant.
+ Score 4: The response is factually sound with only minor inaccuracies.
+ Score 5: The response is factually flawless and completely accurate.
+
+ Provide:
+ - A numeric score (1-5, where 5 is excellent)
+ - A brief explanation justifying the score
+ - Specific suggestions for improvement
+ """

- For each dimension, provide:
- - A numeric score (1-5)
+ RELEVANCE_PROMPT = """
+ Evaluate the response on Relevance: Does the response directly answer the user's question effectively?
+
+ Scoring Rubric:
+ Score 1: The response completely misses the point of the question.
+ Score 2: The response addresses the general topic but fails to answer the specific question.
+ Score 3: The response partially answers the question but misses key aspects.
+ Score 4: The response answers the question well but could be more focused or complete.
+ Score 5: The response perfectly addresses all aspects of the question.
+
+ Provide:
+ - A numeric score (1-5, where 5 is excellent)
  - A brief explanation justifying the score
  - Specific suggestions for improvement
+ """

- Then provide an overall average score and a concise summary of your evaluation.
- Your overall average score should be a single floating-point number between 1 and 5.
+ CLARITY_PROMPT = """
+ Evaluate the response on Clarity: Is the response clearly structured and easily understandable?
+
+ Scoring Rubric:
+ Score 1: The response is extremely confusing and poorly structured.
+ Score 2: The response is difficult to follow with major organizational issues.
+ Score 3: The response is somewhat clear but has organizational or expression issues.
+ Score 4: The response is well-structured with only minor clarity issues.
+ Score 5: The response is exceptionally clear, well-organized, and easy to understand.
+
+ Provide:
+ - A numeric score (1-5, where 5 is excellent)
+ - A brief explanation justifying the score
+ - Specific suggestions for improvement
  """

+ DEPTH_PROMPT = """
+ Evaluate the response on Depth: Does the response provide sufficient detail, insight, or useful context?
+
+ Scoring Rubric:
+ Score 1: The response is extremely shallow with no meaningful detail or insight.
+ Score 2: The response lacks significant depth and provides minimal useful information.
+ Score 3: The response provides some depth but misses opportunities for insight or context.
+ Score 4: The response offers good depth with useful details and context.
+ Score 5: The response provides exceptional depth with comprehensive details, valuable insights, and rich context.
+
+ Provide:
+ - A numeric score (1-5, where 5 is excellent)
+ - A brief explanation justifying the score
+ - Specific suggestions for improvement
+ """

  # Initialize API keys from environment variables or Streamlit secrets
  def initialize_api_keys():
- # Check if we're running in Streamlit Cloud with secrets
- try:
- if hasattr(st, "secrets") and "OPENAI_API_KEY" in st.secrets:
- os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
- os.environ["ANTHROPIC_API_KEY"] = st.secrets["ANTHROPIC_API_KEY"]
- os.environ["TOGETHER_API_KEY"] = st.secrets["TOGETHER_API_KEY"]
- os.environ["ATLA_API_KEY"] = st.secrets["ATLA_API_KEY"]
- # Keys should be loaded from environment variables or .env file
- # No UI for API key input needed
- except Exception as e:
- st.sidebar.error(f"Error loading API keys: {e}")
+ # Load from .env file (already done via load_dotenv() at the top of your script)
+ # No need to check for Streamlit secrets if you're using .env exclusively
+
+ # Check if required keys are in environment variables
+ required_keys = ["OPENAI_API_KEY", "ANTHROPIC_API_KEY", "TOGETHER_API_KEY", "ATLA_API_KEY"]
+ missing_keys = [key for key in required_keys if not os.environ.get(key)]
+
+ if missing_keys:
+ st.sidebar.error(f"Missing API keys: {', '.join(missing_keys)}")
+ st.sidebar.info("Please add these keys to your .env file")
+ return False
+
+ return True


  # Initialize models and session state
  def initialize_app():
- initialize_api_keys()
-
- # Initialize LLM clients if they don't exist or if API keys have been updated
+ keys_loaded = initialize_api_keys()
+
+ # Initialize session state variables if they don't exist
+ if "chat_history" not in st.session_state:
+ st.session_state.chat_history = []
+
+ if "chat_messages" not in st.session_state:
+ st.session_state.chat_messages = [
+ SystemMessage(
+ content="You are a helpful assistant that can answer questions and help with tasks."
+ )
+ ]
+
+ if "latest_result" not in st.session_state:
+ st.session_state.latest_result = None
+
  if "initialized" not in st.session_state:
+ st.session_state.initialized = False
+
+ # Only initialize models if keys are loaded and not already initialized
+ if not st.session_state.initialized and keys_loaded:
  try:
  st.session_state.gpt4o = init_chat_model("gpt-4o", model_provider="openai")
  st.session_state.claude = init_chat_model(
@@ -63,42 +131,76 @@ def initialize_app():
  "deepseek-ai/DeepSeek-V3", model_provider="together"
  )
  st.session_state.atla = Atla()
+ st.session_state.async_atla = AsyncAtla()
  st.session_state.initialized = True

- # Initialize chat messages
- if "chat_messages" not in st.session_state:
- st.session_state.chat_messages = [
- SystemMessage(
- content="You are a helpful assistant that can answer questions and help with tasks."
- )
- ]
-
- # Initialize chat history for display
- if "chat_history" not in st.session_state:
- st.session_state.chat_history = []
-
- # Initialize latest result
- if "latest_result" not in st.session_state:
- st.session_state.latest_result = None
-
  except Exception as e:
  st.error(f"Error initializing models: {e}")
- st.warning("Please check your API keys in the sidebar.")
+ st.warning("Please check your API keys in the .env file.")
  st.session_state.initialized = False


- def evaluate_with_atla(inputs: dict[str, str]) -> Tuple[float, str]:
- """Evaluate response using Atla's Selene model."""
- response = st.session_state.atla.evaluation.create(
+ async def evaluate_dimension(question: str, response: str, dimension_prompt: str) -> Tuple[float, str]:
+ """Evaluate a single dimension using Atla's Selene model asynchronously."""
+ eval_response = await st.session_state.async_atla.evaluation.create(
  model_id="atla-selene",
- model_input=inputs["question"],
- model_output=inputs["response"],
- evaluation_criteria=EVAL_PROMPT,
+ model_input=question,
+ model_output=response,
+ evaluation_criteria=dimension_prompt,
  )
- evaluation = response.result.evaluation
+ evaluation = eval_response.result.evaluation
  return float(evaluation.score), evaluation.critique


+ async def evaluate_with_atla_async(inputs: dict[str, str]) -> Tuple[float, Dict[str, Dict]]:
+ """Evaluate response using Atla's Selene model across all dimensions asynchronously."""
+ # Create tasks for all dimensions
+ accuracy_task = evaluate_dimension(inputs["question"], inputs["response"], ACCURACY_PROMPT)
+ relevance_task = evaluate_dimension(inputs["question"], inputs["response"], RELEVANCE_PROMPT)
+ clarity_task = evaluate_dimension(inputs["question"], inputs["response"], CLARITY_PROMPT)
+ depth_task = evaluate_dimension(inputs["question"], inputs["response"], DEPTH_PROMPT)
+
+ # Run all evaluations concurrently
+ accuracy_score, accuracy_critique = await accuracy_task
+ relevance_score, relevance_critique = await relevance_task
+ clarity_score, clarity_critique = await clarity_task
+ depth_score, depth_critique = await depth_task
+
+ # Calculate average score
+ avg_score = (accuracy_score + relevance_score + clarity_score + depth_score) / 4
+
+ # Compile detailed results
+ detailed_results = {
+ "accuracy": {"score": accuracy_score, "critique": accuracy_critique},
+ "relevance": {"score": relevance_score, "critique": relevance_critique},
+ "clarity": {"score": clarity_score, "critique": clarity_critique},
+ "depth": {"score": depth_score, "critique": depth_critique}
+ }
+
+ # Compile overall critique
+ overall_critique = f"""
+ Accuracy ({accuracy_score}/5): {accuracy_critique}
+
+ Relevance ({relevance_score}/5): {relevance_critique}
+
+ Clarity ({clarity_score}/5): {clarity_critique}
+
+ Depth ({depth_score}/5): {depth_critique}
+
+ **Overall Score: {avg_score:.2f}/5**
+ """
+
+ return avg_score, overall_critique, detailed_results
+
+
+ def evaluate_response(question: str, response: str) -> Dict:
+ """Evaluate a single response using Selene."""
+ inputs = {"question": question, "response": response}
+ # Use asyncio to run the async function
+ score, critique, detailed_results = asyncio.run(evaluate_with_atla_async(inputs))
+ return {"score": score, "critique": critique, "detailed_results": detailed_results}
+
+
  def get_responses(
  question: str, feedback: str = "", with_status: bool = True
  ) -> Dict[str, str]:
@@ -154,13 +256,6 @@ def get_responses(
  return responses


- def evaluate_response(question: str, response: str) -> Dict:
- """Evaluate a single response using Selene."""
- inputs = {"question": question, "response": response}
- score, critique = evaluate_with_atla(inputs)
- return {"score": score, "critique": critique}
-
-
  def evaluate_all_responses(
  question: str, responses: Dict[str, str], use_status: bool = True
  ) -> Dict[str, Dict]:
@@ -364,7 +459,7 @@ def display_evaluation_details():
  disabled=True,
  )

- st.write("**Atla Critique:**")
+ st.write("**Atla Critique's across different dimensions:**")
  st.write(refinement["evaluation"]["critique"])

  # Model comparison
@@ -380,7 +475,7 @@ def display_evaluation_details():
  disabled=True,
  )

- st.write("**Atla Critique:**")
+ st.write("**Atla Critique's across different dimensions:**")
  st.write(eval_data["critique"])

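A note on the new async evaluation path: evaluate_with_atla_async() builds the four coroutines and then awaits them one after another, so despite the "Run all evaluations concurrently" comment the Selene calls still execute sequentially. A minimal sketch of a genuinely concurrent variant using asyncio.gather follows; it reuses evaluate_dimension() and the *_PROMPT constants exactly as they appear in this diff, and the wrapper itself is an illustration, not part of the commit.

    import asyncio

    async def evaluate_all_dimensions_concurrently(question: str, response: str) -> dict:
        """Illustrative only: run the four Selene dimension evaluations in parallel."""
        prompts = {
            "accuracy": ACCURACY_PROMPT,
            "relevance": RELEVANCE_PROMPT,
            "clarity": CLARITY_PROMPT,
            "depth": DEPTH_PROMPT,
        }
        # asyncio.gather schedules every coroutine before awaiting, so the four
        # API calls overlap instead of running back to back.
        results = await asyncio.gather(
            *(evaluate_dimension(question, response, prompt) for prompt in prompts.values())
        )
        # evaluate_dimension() returns (score, critique), per the hunk above.
        return dict(zip(prompts.keys(), results))

When all calls succeed, this brings the wall-clock latency down to roughly that of the slowest single dimension; gather propagates the first exception it sees unless return_exceptions=True is passed.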
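The refinement loop that consumes these evaluations sits outside the hunks shown here, so the following is only a rough, hypothetical sketch of how the new evaluate_response() return shape is intended to interact with QUALITY_THRESHOLD and MAX_ITERATIONS from the configuration block. The driver function is invented for illustration, and it assumes get_responses() returns a model-name to answer mapping, as its Dict[str, str] signature suggests.

    def refine_until_acceptable(question: str) -> dict:
        # Hypothetical driver, not taken from app.py: score each model's answer,
        # keep the best one, and stop once it clears QUALITY_THRESHOLD or the
        # iteration budget runs out.
        feedback, best = "", None
        for _ in range(MAX_ITERATIONS):
            responses = get_responses(question, feedback=feedback, with_status=False)
            evaluations = {
                model: evaluate_response(question, answer)  # {"score", "critique", "detailed_results"}
                for model, answer in responses.items()
            }
            model, evaluation = max(evaluations.items(), key=lambda item: item[1]["score"])
            best = {"model": model, "response": responses[model], **evaluation}
            if evaluation["score"] >= QUALITY_THRESHOLD:
                break
            feedback = evaluation["critique"]  # feed Selene's critique into the next attempt
        return best

Because the per-dimension breakdown now travels along in detailed_results, a display layer could show the accuracy, relevance, clarity, and depth scores separately instead of only the averaged value.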