Commit feab4b2
Parent(s): bb0f942

gpt-4o as default

Files changed:
- tests/analysis.py +4 -4
- tests/grader.py +1 -1
- tests/test_e2e.py +1 -1
tests/analysis.py CHANGED

@@ -119,7 +119,7 @@ def run_evaluation(
     if interview_types is None:
         interview_types = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
     if grader_models is None:
-        grader_models = ["gpt-
+        grader_models = ["gpt-4o"]
     if llm_configs is None:
         llm_configs = [None]
 
@@ -281,7 +281,7 @@ def filter_df(df, prefixes=["problem", "interviewer", "feedback"]):
     return valid_df
 
 
-def generate_analysis_report(df, folder, focus=None, model="gpt-
+def generate_analysis_report(df, folder, focus=None, model="gpt-4o"):
 
     client = OpenAI(base_url="https://api.openai.com/v1")
 
@@ -341,7 +341,7 @@ def analyze_and_improve_segment(df, segment_to_improve=None):
     filtered_df = filtered_df[filtered_df[prefix_columns].mean(axis=1) < th_score]
 
     # Generating an analysis report
-    comments_analysis = generate_analysis_report(filtered_df, None, focus=segment_to_improve, model="gpt-
+    comments_analysis = generate_analysis_report(filtered_df, None, focus=segment_to_improve, model="gpt-4o")
 
     # Constructing improvement prompt
     improvement_prompt = """You want to improve the prompts for LLM interviewer.
@@ -364,7 +364,7 @@ You can add 1-3 lines to each of prompts if needed, but you can't change or remo
 
     # Making API call to OpenAI
     client = OpenAI(base_url="https://api.openai.com/v1")
-    model = "gpt-
+    model = "gpt-4o"
     messages = [
         {"role": "system", "content": improvement_prompt},
         {"role": "user", "content": current_prompts},
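The last hunk stops just before the request is sent. As context only, here is a minimal sketch, assuming the standard openai>=1.x client, of the chat-completion call that a model/messages pair like the one above would typically feed into; the placeholder strings and the response handling below are illustrative and not part of this diff.

from openai import OpenAI

# Placeholder inputs; in tests/analysis.py these are built earlier as
# improvement_prompt and current_prompts.
improvement_prompt = "You want to improve the prompts for LLM interviewer."
current_prompts = "<current prompt text>"

client = OpenAI(base_url="https://api.openai.com/v1")
model = "gpt-4o"
messages = [
    {"role": "system", "content": improvement_prompt},
    {"role": "user", "content": current_prompts},
]
# Hypothetical continuation: send the request and read the model's reply.
response = client.chat.completions.create(model=model, messages=messages)
print(response.choices[0].message.content)
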
tests/grader.py CHANGED

@@ -5,7 +5,7 @@ from openai import OpenAI
 from tests.testing_prompts import grader_prompt
 
 
-def grade(json_file_path, model="gpt-
+def grade(json_file_path, model="gpt-4o", suffix=""):
     client = OpenAI(base_url="https://api.openai.com/v1")
 
     with open(json_file_path) as file:
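With this change, callers of grade() use gpt-4o unless they pass a model explicitly. A hypothetical usage sketch; the file path below is illustrative only, and the 0.4 threshold mirrors tests/test_e2e.py rather than anything in this file.

from tests.grader import grade

# Hypothetical record path for illustration.
feedback = grade("records/interview_coding.json")  # defaults to model="gpt-4o"
feedback_legacy = grade("records/interview_coding.json", model="gpt-3.5-turbo")  # explicit override
assert feedback["overall_score"] > 0.4
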
tests/test_e2e.py CHANGED

@@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
 
 def complete_and_grade_interview(interview_type):
     file_path, _ = complete_interview(interview_type, "test", model="gpt-3.5-turbo")
-    feedback = grade(file_path, model="gpt-
+    feedback = grade(file_path, model="gpt-4o")
     assert feedback["overall_score"] > 0.4
     return feedback["overall_score"]
 
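The hunk header shows that tests/test_e2e.py imports ThreadPoolExecutor, so the per-type helper above is presumably fanned out in parallel. A hedged sketch of what that outer test might look like; the wrapper function and max_workers value are assumptions, not part of the diff.

from concurrent.futures import ThreadPoolExecutor

from tests.test_e2e import complete_and_grade_interview


def test_all_interview_types():
    interview_types = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
    # Run each interview type concurrently; max_workers is an assumed value.
    with ThreadPoolExecutor(max_workers=3) as executor:
        scores = list(executor.map(complete_and_grade_interview, interview_types))
    # complete_and_grade_interview already asserts > 0.4 per type and returns the score.
    assert all(score > 0.4 for score in scores)
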