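"""Run the agent workflow against the golden test dataset.

Loads the golden dataset (regenerating it if missing), executes each test
case through the workflow, validates the output against the expected
results, and saves a timestamped JSON report alongside a console summary.
"""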
import asyncio
import json
import os
from datetime import datetime
from test_workflow import run_workflow
from workflow import create_workflow
from generate_test_dataset import GOLDEN_DATASET_DIR, validate_test_case
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

async def run_golden_tests():
    """Run tests using the golden dataset."""
    
    # Load the golden dataset
    dataset_path = os.path.join(GOLDEN_DATASET_DIR, "golden_dataset.json")
    if not os.path.exists(dataset_path):
        print("Golden dataset not found. Generating new dataset...")
        from generate_test_dataset import generate_golden_dataset
        generate_golden_dataset()
    
    with open(dataset_path, "r") as f:
        golden_dataset = json.load(f)
    
    # Initialize workflow; fail fast with a clear message if the key is missing
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    if not tavily_api_key:
        raise RuntimeError("TAVILY_API_KEY is not set; add it to your .env file.")
    workflow = create_workflow(tavily_api_key)
    
    # Store test results
    test_results = {
        "metadata": {
            "timestamp": datetime.now().isoformat(),
            "dataset_version": golden_dataset["metadata"]["version"]
        },
        "results": []
    }
    
    # Run tests for each test case
    for test_case in golden_dataset["test_cases"]:
        print(f"\nRunning test case: {test_case['input']['query']}")
        try:
            # Run the workflow
            result = await run_workflow(
                workflow,
                test_case["input"]["query"],
                agent_type=test_case["input"]["agent_type"],
                context=test_case["input"]["context"]
            )
            
            # Validate the results and compute overall pass/fail once
            validation_result = validate_test_case(test_case, result)
            success = all(v["passed"] for v in validation_result["validations"])
            
            # Record the outcome for this test case
            test_results["results"].append({
                "test_case_id": test_case["id"],
                "query": test_case["input"]["query"],
                "success": success,
                "validation_results": validation_result,
                "workflow_output": result
            })
            
            # Print progress
            status = "βœ… Passed" if success else "❌ Failed"
            print(f"{status} - {test_case['input']['query']}")
            
        except Exception as e:
            print(f"❌ Error running test case: {str(e)}")
            test_results["results"].append({
                "test_case_id": test_case["id"],
                "query": test_case["input"]["query"],
                "success": False,
                "error": str(e)
            })
    
    # Save test results
    results_dir = os.path.join(GOLDEN_DATASET_DIR, "results")
    os.makedirs(results_dir, exist_ok=True)
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(results_dir, f"test_results_{timestamp}.json")
    
    with open(output_file, "w") as f:
        json.dump(test_results, f, indent=2)
    
    # Print summary
    total_tests = len(test_results["results"])
    passed_tests = sum(1 for r in test_results["results"] if r.get("success", False))
    # Guard against an empty dataset to avoid a ZeroDivisionError
    success_rate = (passed_tests / total_tests) * 100 if total_tests else 0.0
    
    print("\n" + "="*50)
    print("Test Summary:")
    print(f"Total Tests: {total_tests}")
    print(f"Passed: {passed_tests}")
    print(f"Failed: {total_tests - passed_tests}")
    print(f"Success Rate: {success_rate:.2f}%")
    print("="*50)
    print(f"\nDetailed results saved to: {output_file}")

if __name__ == "__main__":
    print("\n" + "="*50)
    print("πŸ§ͺ Running Golden Dataset Tests")
    print("="*50)
    
    try:
        asyncio.run(run_golden_tests())
    except Exception as e:
        print(f"\n❌ Critical error: {str(e)}")
        raise