import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Normalization functions (same as extractor)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score_qa(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

# Identical confidence calculation to extractor
def calculate_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question, 
        context, 
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    
    start_prob = start_probs[0, answer_start].item()
    end_prob = end_probs[0, answer_end-1].item()
    confidence = np.sqrt(start_prob * end_prob)
    
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    
    return answer, float(confidence)

def run_evaluation(num_samples=100):
    # Authenticate
    if token := os.getenv("HF_TOKEN"):
        login(token=token)
    
    # Load model same as extractor
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    # Load CUAD dataset
    dataset = load_dataset("theatticusproject/cuad-qa", token=token)
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    
    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        
        pred_answer, confidence = calculate_confidence(model, tokenizer, question, context)
        
        results.append({
            "question": question,
            "prediction": pred_answer,
            "ground_truth": gt_answer,
            "confidence": confidence,
            "exact_match": exact_match_score(pred_answer, gt_answer),
            "f1": f1_score_qa(pred_answer, gt_answer)
        })
    
    # Generate report
    df = pd.DataFrame(results)
    avg_metrics = {
        "exact_match": df["exact_match"].mean() * 100,
        "f1": df["f1"].mean() * 100,
        "confidence": df["confidence"].mean() * 100
    }
    
    # Confidence calibration analysis
    high_conf_correct = df[(df["confidence"] > 0.8) & (df["exact_match"] == 1)].shape[0]
    high_conf_total = df[df["confidence"] > 0.8].shape[0]
    
    report = f"""
    CUAD Evaluation Report (n={len(df)})
    ========================
    Accuracy:
    - Exact Match: {avg_metrics['exact_match']:.2f}%
    - F1 Score: {avg_metrics['f1']:.2f}%
    
    Confidence Analysis:
    - Avg Confidence: {avg_metrics['confidence']:.2f}%
    - High-Confidence (>80%) Accuracy: {high_conf_correct}/{high_conf_total} ({high_conf_correct/max(1,high_conf_total)*100:.1f}%)
    
    Confidence vs Accuracy:
    {df[['confidence', 'exact_match']].corr().iloc[0,1]:.3f} correlation
    """
    
    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "metrics": avg_metrics,
            "samples": results,
            "config": {
                "model": model_name,
                "confidence_method": "geometric_mean_start_end_probs"
            }
        }, f, indent=2)
    
    return report, df, results_file

if __name__ == "__main__":
    report, df, _ = run_evaluation()
    print(report)
    print("\nSample predictions:")
    print(df.head())