Spaces:
Runtime error
Runtime error
Commit
·
a4e6a71
1
Parent(s):
15bbe10
- __pycache__/main.cpython-310.pyc +0 -0
- __pycache__/models.cpython-310.pyc +0 -0
- app.py +178 -156
- main.py +14 -14
- models.py +16 -7
__pycache__/main.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ
|
|
|
__pycache__/models.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -4,166 +4,188 @@ from models import chat_with_model, embed
|
|
| 4 |
from prompts import questions as predefined_questions, create_gen_prompt, create_judge_prompt
|
| 5 |
import requests
|
| 6 |
import numpy as np
|
| 7 |
-
import os
|
| 8 |
|
| 9 |
st.title("Aiden Bench - Generator")
|
| 10 |
|
| 11 |
# API Key Inputs with Security and User Experience Enhancements
|
| 12 |
-
st.warning("Please keep your API keys secure and confidential.")
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
model_names = [model["id"] for model in models]
|
| 30 |
-
except requests.exceptions.RequestException as e:
|
| 31 |
-
st.error(f"Error fetching models from OpenRouter API: {e}")
|
| 32 |
-
model_names = [] # Provide an empty list if API call fails
|
| 33 |
-
|
| 34 |
-
# Model Selection
|
| 35 |
-
if model_names:
|
| 36 |
-
model_name = st.selectbox("Select a Language Model", model_names)
|
| 37 |
-
else:
|
| 38 |
-
st.error("No models available. Please check your API connection.")
|
| 39 |
-
st.stop() # Stop execution if no models are available
|
| 40 |
-
|
| 41 |
-
# Initialize session state for user_questions and predefined_questions
|
| 42 |
-
if "user_questions" not in st.session_state:
|
| 43 |
-
st.session_state.user_questions = []
|
| 44 |
-
|
| 45 |
-
# Workflow Selection
|
| 46 |
-
workflow = st.radio("Select Workflow:", ["Use Predefined Questions", "Use User-Defined Questions"])
|
| 47 |
-
|
| 48 |
-
# Handle Predefined Questions
|
| 49 |
-
if workflow == "Use Predefined Questions":
|
| 50 |
-
st.header("Question Selection")
|
| 51 |
-
# Multiselect for predefined questions
|
| 52 |
-
selected_questions = st.multiselect(
|
| 53 |
-
"Select questions to benchmark:",
|
| 54 |
-
predefined_questions,
|
| 55 |
-
predefined_questions # Select all by default
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
-
# Handle User-Defined Questions
|
| 59 |
-
elif workflow == "Use User-Defined Questions":
|
| 60 |
-
st.header("Question Input")
|
| 61 |
-
|
| 62 |
-
# Input for adding a new question
|
| 63 |
-
new_question = st.text_input("Enter a new question:")
|
| 64 |
-
if st.button("Add Question") and new_question:
|
| 65 |
-
new_question = new_question.strip() # Remove leading/trailing whitespace
|
| 66 |
-
if new_question and new_question not in st.session_state.user_questions:
|
| 67 |
-
st.session_state.user_questions.append(new_question) # Append to session state
|
| 68 |
-
st.success(f"Question '{new_question}' added successfully.")
|
| 69 |
-
else:
|
| 70 |
-
st.warning("Question already exists or is empty!")
|
| 71 |
-
|
| 72 |
-
# Display multiselect with updated user questions
|
| 73 |
-
selected_questions = st.multiselect(
|
| 74 |
-
"Select your custom questions:",
|
| 75 |
-
options=st.session_state.user_questions,
|
| 76 |
-
default=st.session_state.user_questions
|
| 77 |
-
)
|
| 78 |
-
|
| 79 |
-
# Display selected questions
|
| 80 |
-
st.write("Selected Questions:", selected_questions)
|
| 81 |
-
|
| 82 |
-
# Benchmark Execution
|
| 83 |
-
if st.button("Start Benchmark"):
|
| 84 |
-
if not selected_questions:
|
| 85 |
-
st.warning("Please select at least one question.")
|
| 86 |
-
elif not open_router_key or not openai_api_key: # Check if API keys are provided
|
| 87 |
st.warning("Please enter both API keys.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
else:
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
})
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from prompts import questions as predefined_questions, create_gen_prompt, create_judge_prompt
|
| 5 |
import requests
|
| 6 |
import numpy as np
|
| 7 |
+
import os
|
| 8 |
|
| 9 |
st.title("Aiden Bench - Generator")
|
| 10 |
|
| 11 |
# API Key Inputs with Security and User Experience Enhancements
|
| 12 |
+
st.warning("Please keep your API keys secure and confidential. This app does not store or log your API keys.")
|
| 13 |
+
st.write("Learn how to obtain API keys from Open Router and OpenAI.") # Add links or instructions here
|
| 14 |
+
|
| 15 |
+
if "open_router_key" not in st.session_state:
|
| 16 |
+
st.session_state.open_router_key = ""
|
| 17 |
+
if "openai_api_key" not in st.session_state:
|
| 18 |
+
st.session_state.openai_api_key = ""
|
| 19 |
+
|
| 20 |
+
open_router_key = st.text_input("Enter your Open Router API Key:", type="password", value=st.session_state.open_router_key)
|
| 21 |
+
openai_api_key = st.text_input("Enter your OpenAI API Key:", type="password", value=st.session_state.openai_api_key)
|
| 22 |
+
|
| 23 |
+
if st.button("Confirm API Keys"):
|
| 24 |
+
if open_router_key and openai_api_key:
|
| 25 |
+
st.session_state.open_router_key = open_router_key
|
| 26 |
+
st.session_state.openai_api_key = openai_api_key
|
| 27 |
+
st.success("API keys confirmed!")
|
| 28 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
st.warning("Please enter both API keys.")
|
| 30 |
+
|
| 31 |
+
# Access API keys from session state
|
| 32 |
+
if st.session_state.open_router_key and st.session_state.openai_api_key:
|
| 33 |
+
# Fetch models from OpenRouter API
|
| 34 |
+
try:
|
| 35 |
+
response = requests.get("https://openrouter.ai/api/v1/models")
|
| 36 |
+
response.raise_for_status() # Raise an exception for bad status codes
|
| 37 |
+
models = response.json()["data"]
|
| 38 |
+
|
| 39 |
+
# Sort models alphabetically by their ID
|
| 40 |
+
models.sort(key=lambda model: model["id"])
|
| 41 |
+
|
| 42 |
+
model_names = [model["id"] for model in models]
|
| 43 |
+
except requests.exceptions.RequestException as e:
|
| 44 |
+
st.error(f"Error fetching models from OpenRouter API: {e}")
|
| 45 |
+
model_names = [] # Provide an empty list if API call fails
|
| 46 |
+
|
| 47 |
+
# Model Selection
|
| 48 |
+
if model_names:
|
| 49 |
+
model_name = st.selectbox("Select a Language Model", model_names)
|
| 50 |
else:
|
| 51 |
+
st.error("No models available. Please check your API connection.")
|
| 52 |
+
st.stop() # Stop execution if no models are available
|
| 53 |
+
|
| 54 |
+
# Initialize session state for user_questions and predefined_questions
|
| 55 |
+
if "user_questions" not in st.session_state:
|
| 56 |
+
st.session_state.user_questions = []
|
| 57 |
+
|
| 58 |
+
# Workflow Selection
|
| 59 |
+
workflow = st.radio("Select Workflow:", ["Use Predefined Questions", "Use User-Defined Questions"])
|
| 60 |
+
|
| 61 |
+
# Handle Predefined Questions
|
| 62 |
+
if workflow == "Use Predefined Questions":
|
| 63 |
+
st.header("Question Selection")
|
| 64 |
+
# Multiselect for predefined questions
|
| 65 |
+
selected_questions = st.multiselect(
|
| 66 |
+
"Select questions to benchmark:",
|
| 67 |
+
predefined_questions,
|
| 68 |
+
predefined_questions # Select all by default
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# Handle User-Defined Questions
|
| 72 |
+
elif workflow == "Use User-Defined Questions":
|
| 73 |
+
st.header("Question Input")
|
| 74 |
+
|
| 75 |
+
# Input for adding a new question
|
| 76 |
+
new_question = st.text_input("Enter a new question:")
|
| 77 |
+
if st.button("Add Question") and new_question:
|
| 78 |
+
new_question = new_question.strip() # Remove leading/trailing whitespace
|
| 79 |
+
if new_question and new_question not in st.session_state.user_questions:
|
| 80 |
+
st.session_state.user_questions.append(new_question) # Append to session state
|
| 81 |
+
st.success(f"Question '{new_question}' added successfully.")
|
| 82 |
+
else:
|
| 83 |
+
st.warning("Question already exists or is empty!")
|
| 84 |
+
|
| 85 |
+
# Display multiselect with updated user questions
|
| 86 |
+
selected_questions = st.multiselect(
|
| 87 |
+
"Select your custom questions:",
|
| 88 |
+
options=st.session_state.user_questions,
|
| 89 |
+
default=st.session_state.user_questions
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
# Display selected questions
|
| 93 |
+
st.write("Selected Questions:", selected_questions)
|
| 94 |
+
|
| 95 |
+
# Benchmark Execution
|
| 96 |
+
if st.button("Start Benchmark"):
|
| 97 |
+
if not selected_questions:
|
| 98 |
+
st.warning("Please select at least one question.")
|
| 99 |
+
else:
|
| 100 |
+
# Initialize progress bar
|
| 101 |
+
progress_bar = st.progress(0)
|
| 102 |
+
num_questions = len(selected_questions)
|
| 103 |
+
results = [] # List to store results
|
| 104 |
+
|
| 105 |
+
# Iterate through selected questions
|
| 106 |
+
for i, question in enumerate(selected_questions):
|
| 107 |
+
# Display current question
|
| 108 |
+
st.write(f"Processing question {i+1}/{num_questions}: {question}")
|
| 109 |
+
|
| 110 |
+
previous_answers = []
|
| 111 |
+
question_novelty = 0
|
| 112 |
+
|
| 113 |
+
try:
|
| 114 |
+
while True:
|
| 115 |
+
gen_prompt = create_gen_prompt(question, previous_answers)
|
| 116 |
+
|
| 117 |
+
try:
|
| 118 |
+
new_answer = chat_with_model(
|
| 119 |
+
prompt=gen_prompt,
|
| 120 |
+
model=model_name,
|
| 121 |
+
open_router_key=st.session_state.open_router_key,
|
| 122 |
+
openai_api_key=st.session_state.openai_api_key
|
| 123 |
+
)
|
| 124 |
+
except requests.exceptions.RequestException as e:
|
| 125 |
+
st.error(f"API Error: {e}")
|
| 126 |
+
break
|
| 127 |
+
|
| 128 |
+
judge_prompt = create_judge_prompt(question, new_answer)
|
| 129 |
+
judge = "openai/gpt-4o-mini"
|
| 130 |
+
|
| 131 |
+
try:
|
| 132 |
+
judge_response = chat_with_model(
|
| 133 |
+
prompt=judge_prompt,
|
| 134 |
+
model=judge,
|
| 135 |
+
open_router_key=st.session_state.open_router_key,
|
| 136 |
+
openai_api_key=st.session_state.openai_api_key
|
| 137 |
+
)
|
| 138 |
+
except requests.exceptions.RequestException as e:
|
| 139 |
+
st.error(f"API Error (Judge): {e}")
|
| 140 |
+
break
|
| 141 |
+
|
| 142 |
+
coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
|
| 143 |
+
|
| 144 |
+
if coherence_score <= 3:
|
| 145 |
+
st.warning("Output is incoherent. Moving to next question.")
|
| 146 |
+
break
|
| 147 |
+
|
| 148 |
+
novelty_score = get_novelty_score(new_answer, previous_answers, st.session_state.openai_api_key)
|
| 149 |
+
|
| 150 |
+
if novelty_score < 0.1:
|
| 151 |
+
st.warning("Output is redundant. Moving to next question.")
|
| 152 |
+
break
|
| 153 |
+
|
| 154 |
+
st.write(f"New Answer:\n{new_answer}")
|
| 155 |
+
st.write(f"Coherence Score: {coherence_score}")
|
| 156 |
+
st.write(f"Novelty Score: {novelty_score}")
|
| 157 |
+
|
| 158 |
+
previous_answers.append(new_answer)
|
| 159 |
+
question_novelty += novelty_score
|
| 160 |
+
|
| 161 |
+
except Exception as e:
|
| 162 |
+
st.error(f"Error processing question: {e}")
|
| 163 |
+
|
| 164 |
+
results.append({
|
| 165 |
+
"question": question,
|
| 166 |
+
"answers": previous_answers,
|
| 167 |
+
"coherence_score": coherence_score,
|
| 168 |
+
"novelty_score": novelty_score
|
| 169 |
})
|
| 170 |
+
|
| 171 |
+
# Update progress bar
|
| 172 |
+
progress_bar.progress((i + 1) / num_questions)
|
| 173 |
+
|
| 174 |
+
st.success("Benchmark completed!")
|
| 175 |
+
|
| 176 |
+
# Display results in a table
|
| 177 |
+
st.write("Results:")
|
| 178 |
+
results_table = []
|
| 179 |
+
for result in results:
|
| 180 |
+
for answer in result["answers"]:
|
| 181 |
+
results_table.append({
|
| 182 |
+
"Question": result["question"],
|
| 183 |
+
"Answer": answer,
|
| 184 |
+
"Coherence Score": result["coherence_score"],
|
| 185 |
+
"Novelty Score": result["novelty_score"]
|
| 186 |
+
})
|
| 187 |
+
st.table(results_table)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
else:
|
| 191 |
+
st.warning("Please confirm your API keys first.")
|
main.py
CHANGED
|
@@ -79,25 +79,25 @@ def process_question(question, model_name):
|
|
| 79 |
return question_novelty
|
| 80 |
|
| 81 |
|
| 82 |
-
def get_novelty_score(new_answer: str, previous_answers: list):
|
| 83 |
-
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
|
| 89 |
-
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
|
| 100 |
-
|
| 101 |
|
| 102 |
|
| 103 |
def benchmark_model_multithreaded(model_name):
|
|
|
|
| 79 |
return question_novelty
|
| 80 |
|
| 81 |
|
| 82 |
+
def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key=None):
    """Score how different ``new_answer`` is from every earlier answer.

    The score is ``1 - s``, where ``s`` is the highest cosine similarity
    between the embedding of ``new_answer`` and the embedding of any entry
    in ``previous_answers``.

    Args:
        new_answer: Candidate answer text to score.
        previous_answers: Earlier answer texts to compare against.
        openai_api_key: Forwarded unchanged to ``embed`` for every text.

    Returns:
        ``1.0`` when ``previous_answers`` is empty; otherwise one minus the
        maximum cosine similarity found.
    """
    # Nothing to compare against: answer immediately instead of requesting an
    # embedding whose value would not be used by this call.
    if not previous_answers:
        return 1.0

    new_embedding = embed(new_answer, openai_api_key)
    # The norm of the new embedding is the same for every comparison, so it
    # is computed once rather than once per previous answer.
    new_norm = np.linalg.norm(new_embedding)

    previous_embeddings = (
        embed(answer, openai_api_key) for answer in previous_answers
    )
    max_similarity = max(
        np.dot(new_embedding, prev_embedding)
        / (new_norm * np.linalg.norm(prev_embedding))
        for prev_embedding in previous_embeddings
    )

    return 1 - max_similarity
|
| 101 |
|
| 102 |
|
| 103 |
def benchmark_model_multithreaded(model_name):
|
models.py
CHANGED
|
@@ -5,11 +5,17 @@ from retry import retry
|
|
| 5 |
|
| 6 |
|
| 7 |
@retry(tries=3)
|
| 8 |
-
def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
response = client.chat.completions.create(
|
| 14 |
model=model,
|
| 15 |
messages=[
|
|
@@ -26,8 +32,11 @@ def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
|
|
| 26 |
|
| 27 |
@lru_cache(maxsize=10000)
|
| 28 |
@retry(tries=3)
|
| 29 |
-
def embed(text):
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
response = client.embeddings.create(
|
| 33 |
model="text-embedding-3-large", input=[text])
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
@retry(tries=3)
|
| 8 |
+
def chat_with_model(prompt, model, open_router_key=None, openai_api_key=None, max_tokens=4000, temperature=0):
|
| 9 |
+
if open_router_key:
|
| 10 |
+
client = OpenAI(
|
| 11 |
+
api_key=open_router_key,
|
| 12 |
+
base_url="https://openrouter.ai/api/v1"
|
| 13 |
+
)
|
| 14 |
+
elif openai_api_key:
|
| 15 |
+
client = OpenAI(api_key=openai_api_key)
|
| 16 |
+
else:
|
| 17 |
+
raise ValueError("Either open_router_key or openai_api_key must be provided.")
|
| 18 |
+
|
| 19 |
response = client.chat.completions.create(
|
| 20 |
model=model,
|
| 21 |
messages=[
|
|
|
|
| 32 |
|
| 33 |
@lru_cache(maxsize=10000)
|
| 34 |
@retry(tries=3)
|
| 35 |
+
def embed(text, openai_api_key=None):
|
| 36 |
+
if openai_api_key:
|
| 37 |
+
client = OpenAI(api_key=openai_api_key)
|
| 38 |
+
else:
|
| 39 |
+
raise ValueError("openai_api_key must be provided.")
|
| 40 |
|
| 41 |
response = client.embeddings.create(
|
| 42 |
model="text-embedding-3-large", input=[text])
|