import asyncio
import json
import os
from datetime import datetime
from test_workflow import run_workflow
from workflow import create_workflow
from generate_test_dataset import GOLDEN_DATASET_DIR, validate_test_case
from dotenv import load_dotenv
# Load environment variables
load_dotenv()


async def run_golden_tests():
"""Run tests using the golden dataset."""
# Load the golden dataset
dataset_path = os.path.join(GOLDEN_DATASET_DIR, "golden_dataset.json")
if not os.path.exists(dataset_path):
print("Golden dataset not found. Generating new dataset...")
from generate_test_dataset import generate_golden_dataset
generate_golden_dataset()
with open(dataset_path, 'r') as f:
golden_dataset = json.load(f)
# Initialize workflow
workflow = create_workflow(os.getenv("TAVILY_API_KEY"))
# Store test results
test_results = {
"metadata": {
"timestamp": datetime.now().isoformat(),
"dataset_version": golden_dataset["metadata"]["version"]
},
"results": []
}
# Run tests for each test case
for test_case in golden_dataset["test_cases"]:
print(f"\nRunning test case: {test_case['input']['query']}")
try:
# Run the workflow
result = await run_workflow(
workflow,
test_case["input"]["query"],
agent_type=test_case["input"]["agent_type"],
context=test_case["input"]["context"]
)
# Validate the results
validation_result = validate_test_case(test_case, result)
# Add results
test_results["results"].append({
"test_case_id": test_case["id"],
"query": test_case["input"]["query"],
"success": all(v["passed"] for v in validation_result["validations"]),
"validation_results": validation_result,
"workflow_output": result
})
# Print progress
success = all(v["passed"] for v in validation_result["validations"])
status = "β
Passed" if success else "β Failed"
print(f"{status} - {test_case['input']['query']}")
except Exception as e:
print(f"β Error running test case: {str(e)}")
test_results["results"].append({
"test_case_id": test_case["id"],
"query": test_case["input"]["query"],
"success": False,
"error": str(e)
})
# Save test results
results_dir = os.path.join(GOLDEN_DATASET_DIR, "results")
os.makedirs(results_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = os.path.join(results_dir, f"test_results_{timestamp}.json")
with open(output_file, "w") as f:
json.dump(test_results, f, indent=2)
# Print summary
total_tests = len(test_results["results"])
passed_tests = sum(1 for r in test_results["results"] if r.get("success", False))
print("\n" + "="*50)
print("Test Summary:")
print(f"Total Tests: {total_tests}")
print(f"Passed: {passed_tests}")
print(f"Failed: {total_tests - passed_tests}")
print(f"Success Rate: {(passed_tests/total_tests)*100:.2f}%")
print("="*50)
print(f"\nDetailed results saved to: {output_file}")
if __name__ == "__main__":
print("\n" + "="*50)
print("π§ͺ Running Golden Dataset Tests")
print("="*50)
try:
asyncio.run(run_golden_tests())
except Exception as e:
print(f"\nβ Critical error: {str(e)}")
raise |