import json
import os
import re
import string
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering


# Normalization functions (SQuAD-style answer normalization)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


# Metrics
def exact_match_score(pred, truth):
    return int(normalize_answer(pred) == normalize_answer(truth))


def f1_score_qa(pred, truth):
    pred_tokens = normalize_answer(pred).split()
    truth_tokens = normalize_answer(truth).split()
    common = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)


# Identical to the extractor's QA confidence computation
def get_qa_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Confidence = geometric mean of the start/end token probabilities
    confidence = np.sqrt(
        start_probs[0, answer_start].item() * end_probs[0, answer_end - 1].item()
    )

    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer.strip(), float(confidence)


def run_evaluation(num_samples=100):
    # Load CUAD, trusting the dataset's loading script
    dataset = load_dataset(
        "theatticusproject/cuad-qa",
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN", True)  # True falls back to the locally cached HF token
    )
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))

    # Load model
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

        pred, conf = get_qa_confidence(model, tokenizer, question, context)
        results.append({
            "question": question[:100] + "..." if len(question) > 100 else question,
            "prediction": pred,
            "confidence": conf,
            "exact_match": exact_match_score(pred, gt_answer),
            "f1": f1_score_qa(pred, gt_answer),
            "ground_truth": gt_answer
        })

    # Generate report
    df = pd.DataFrame(results)
    high_conf_acc = df[df['confidence'] > 0.8]['exact_match'].mean()
    report = f"""
Evaluation Results (n={len(df)})
=================
Exact Match: {df['exact_match'].mean():.1%}
F1 Score: {df['f1'].mean():.1%}
Avg Confidence: {df['confidence'].mean():.1%}
High-Confidence Accuracy: {high_conf_acc:.1%}
"""

    # Save
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"eval_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            "config": {"model": model_name, "dataset": "cuad-qa"},
            "metrics": {
                "exact_match": float(df['exact_match'].mean()),
                "f1": float(df['f1'].mean()),
                "confidence": float(df['confidence'].mean())
            },
            "samples": results
        }, f, indent=2)

    return report, df, results_file


if __name__ == "__main__":
    report, df, _ = run_evaluation(num_samples=50)
    print(report)
    print("\nSample predictions:")
    print(df[["question", "confidence", "exact_match"]].head())
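
# Usage sketch: the file name below is hypothetical (this snippet does not name the
# module), so adjust the path/import to match your repo layout.
#
#   python evaluate_cuad.py
#
# or, from another module, to evaluate a larger slice and keep the JSON results path:
#
#   from evaluate_cuad import run_evaluation
#   report, df, results_file = run_evaluation(num_samples=200)
#   print(report)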