# Import dependencies
import random
import string
import subprocess  # Import subprocess for downloading spaCy models
import sys

import gradio as gr
import nltk
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration

# Download NLTK data (if not already downloaded)
nltk.download('punkt')
# Newer NLTK releases load the sentence tokenizer from 'punkt_tab' instead of
# 'punkt'; fetch both so nltk.sent_tokenize works regardless of the installed version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')  # Download WordNet for enhanced synonym lookup

# Download spaCy model if not already installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Run the download with the interpreter executing this script so the model
    # is installed into the same environment; a bare "python" may resolve to a
    # different installation (or not exist at all).
    # check=True surfaces a failed download immediately instead of letting the
    # retry below fail with the same, less informative OSError.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Pick the compute device once: CUDA when torch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Classifier used by detect_ai_generated, together with its tokenizer.
# NOTE(review): the checkpoint name indicates a DistilBERT model fine-tuned on
# SST-2 — confirm that its class index 1 is a meaningful "AI-generated" signal.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = model.to(device)

# T5 paraphrasing model and tokenizer used by humanize_text.
paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase")
paraphrase_model = paraphrase_model.to(device)

# AI detection function using DistilBERT with batch processing
def detect_ai_generated(texts, batch_size=32):
    """Score texts with the module-level sequence-classification model.

    Args:
        texts: A single string or an iterable of strings to score.
        batch_size: Maximum number of texts sent through the model in one
            forward pass, so long inputs do not become one unbounded batch.

    Returns:
        A list of floats, one per input text: the softmax probability of
        class index 1. An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # NOTE(review): the checkpoint loaded for `model` looks like an SST-2
    # sentiment classifier — confirm that class index 1 means "AI-generated".
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string is one text, not a sequence of characters to slice.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to score; avoid handing an empty batch to the tokenizer.
        return []

    probabilities = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Column 1 of the softmax output is the score reported to callers.
        probabilities.extend(torch.softmax(outputs.logits, dim=1)[:, 1].cpu().tolist())
    return probabilities

# Synonym replacement using spaCy part-of-speech tags and WordNet synonyms.
# Maps the spaCy coarse POS tags eligible for replacement to WordNet POS codes.
_WORDNET_POS = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}


def _wordnet_synonyms(word, wordnet_pos):
    """Return sorted single-word WordNet synonyms of *word*, excluding the word itself."""
    # Local import: only this helper needs the WordNet corpus reader.
    from nltk.corpus import wordnet

    lowered = word.lower()
    candidates = set()
    for synset in wordnet.synsets(word, pos=wordnet_pos):
        for lemma_name in synset.lemma_names():
            # Multi-word lemmas are spelled with underscores; skip them so a
            # single token is always replaced by a single word.
            if "_" in lemma_name or lemma_name.lower() == lowered:
                continue
            candidates.add(lemma_name)
    # Sorted so that a seeded random.choice is reproducible.
    return sorted(candidates)


def _match_case(replacement, original):
    """Return *replacement* with the capitalisation pattern of *original*."""
    if len(original) > 1 and original.isupper():
        return replacement.upper()
    if original[:1].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def replace_with_synonyms(text, probability=0.3):
    """Randomly swap content words in *text* for WordNet synonyms.

    Each noun, verb, adjective or adverb (as tagged by spaCy) is replaced with
    probability *probability* by a randomly chosen synonym of the same part of
    speech. Tokens without a synonym are left untouched.

    The original spacing and punctuation placement are preserved because every
    token is re-emitted together with its own trailing whitespace.

    Args:
        text: The text to perturb.
        probability: Chance, per eligible token, of attempting a replacement.

    Returns:
        The text with some words replaced. WordNet returns base-form lemmas,
        so an inflected word may be replaced by an uninflected synonym.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        replacement = token.text
        wordnet_pos = _WORDNET_POS.get(token.pos_)
        if wordnet_pos is not None and random.random() < probability:
            synonyms = _wordnet_synonyms(token.text, wordnet_pos)
            if synonyms:
                replacement = _match_case(random.choice(synonyms), token.text)
        # token.whitespace_ is the whitespace that followed the token in the
        # input, so joining the pieces reproduces the original layout.
        pieces.append(replacement + token.whitespace_)
    return "".join(pieces)

# Random text transformations to simulate human-like errors
def random_capitalize(word):
    """Return *word* capitalised about 10% of the time.

    Only purely alphabetic words are candidates; anything else is returned
    unchanged without consuming a random draw.
    """
    if not word.isalpha():
        return word
    if random.random() >= 0.1:
        return word
    return word.capitalize()

def random_remove_punctuation(text):
    """Drop up to three randomly chosen punctuation characters, 20% of the time.

    Returns *text* unchanged when the random gate does not fire or when the
    text contains no character from ``string.punctuation``.
    """
    if random.random() >= 0.2:
        return text

    punct_positions = [pos for pos, ch in enumerate(text) if ch in string.punctuation]
    if not punct_positions:
        return text

    # Pick the positions to delete, then rebuild the string without them.
    doomed = set(random.sample(punct_positions, min(3, len(punct_positions))))
    return ''.join(ch for pos, ch in enumerate(text) if pos not in doomed)

def random_double_period(text):
    """Turn the first three periods of *text* into '..', 20% of the time."""
    if random.random() >= 0.2:
        return text
    # Splitting on at most three periods and re-joining with '..' doubles
    # exactly those periods and leaves any later ones alone.
    segments = text.split('.', 3)
    return '..'.join(segments)

def random_double_space(text):
    """Widen up to three word gaps in *text* to a double space, 20% of the time.

    When the random gate fires, the text is split on whitespace and re-joined
    with single spaces, except for up to three distinct, randomly chosen gaps
    that get exactly two spaces. Otherwise *text* is returned unchanged.

    Args:
        text: The text to perturb.

    Returns:
        The (possibly) perturbed text.
    """
    if random.random() >= 0.2:
        return text

    words = text.split()
    # A gap exists after every word except the last one.
    gap_count = max(0, len(words) - 1)
    # Sampling distinct gaps avoids widening the same gap repeatedly.
    for idx in random.sample(range(gap_count), min(3, gap_count)):
        # One extra trailing space plus the join separator gives two spaces.
        words[idx] += ' '
    return ' '.join(words)

def random_replace_comma_space(text, period_replace_percentage=0.33):
    """Move the space in front of some commas and periods.

    Roughly one third (at least one, when any exist) of the ", " occurrences
    become " ," and, independently, the same share of ". " occurrences become
    " .". Which occurrences are changed is chosen at random.

    NOTE(review): ``period_replace_percentage`` is accepted but never read;
    the one-third share is hard-coded for both commas and periods.
    """
    def _positions(needle):
        # Start index of every occurrence of needle, in ascending order.
        found = []
        cursor = text.find(needle)
        while cursor != -1:
            found.append(cursor)
            cursor = text.find(needle, cursor + 1)
        return found

    comma_positions = _positions(", ")
    period_positions = _positions(". ")

    comma_quota = max(1, len(comma_positions) // 3)
    period_quota = max(1, len(period_positions) // 3)

    # Commas are sampled before periods, one sample call each.
    chosen = random.sample(comma_positions, min(comma_quota, len(comma_positions)))
    chosen += random.sample(period_positions, min(period_quota, len(period_positions)))

    # Each chosen position holds "<mark><space>"; swapping the two characters
    # yields "<space><mark>". Matches never overlap, so order is irrelevant.
    chars = list(text)
    for pos in chosen:
        chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
    return ''.join(chars)

def transform_paragraph(paragraph):
    """Apply the random "human error" pipeline to one paragraph.

    Paragraphs of twelve words or fewer are returned untouched. Longer ones
    are re-joined with single spaces after per-word random capitalisation and
    then passed through each perturbation step in a fixed order.
    """
    tokens = paragraph.split()
    if len(tokens) <= 12:
        return paragraph

    result = ' '.join([random_capitalize(token) for token in tokens])
    pipeline = (
        random_remove_punctuation,
        random_double_period,
        random_double_space,
        random_replace_comma_space,
        replace_with_synonyms,  # synonym swap runs last, on the perturbed text
    )
    for step in pipeline:
        result = step(result)
    return result

def transform_text(text):
    """Run transform_paragraph on every newline-separated line of *text*.

    The lines are re-joined with single newlines, so the number of lines and
    the position of blank lines are preserved.
    """
    return '\n'.join(map(transform_paragraph, text.split('\n')))

# Humanize the AI-detected text using the SRDdev Paraphrase model
def humanize_text(AI_text):
    """Paraphrase *AI_text* paragraph by paragraph with the T5 paraphrase model.

    The text is split on newlines; blank lines are skipped and every remaining
    line is paraphrased on its own. Input beyond 512 tokens per paragraph is
    truncated by the tokenizer. Generation uses sampling, so repeated calls
    with the same input can return different text.

    Args:
        AI_text: The text to paraphrase.

    Returns:
        The paraphrased paragraphs joined by blank lines ("\\n\\n"); an empty
        string when the input has no non-blank line.
    """
    paraphrased_paragraphs = []
    for paragraph in AI_text.split("\n"):
        if not paragraph.strip():
            continue
        inputs = paraphrase_tokenizer(
            paragraph,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        ).to(device)
        paraphrased_ids = paraphrase_model.generate(
            input_ids=inputs['input_ids'],
            # Forward the tokenizer's mask explicitly rather than leaving
            # generate() to infer it from the input ids.
            attention_mask=inputs['attention_mask'],
            # Allow the paraphrase to be slightly longer than its source.
            max_length=inputs['input_ids'].shape[-1] + 20,
            num_beams=2,
            early_stopping=True,
            length_penalty=0.8,
            no_repeat_ngram_size=2,
            do_sample=True,  # Sample within the beams to add randomness
            top_k=50,  # Top-k sampling
            top_p=0.95,  # Top-p (nucleus) sampling
        )
        paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
        paraphrased_paragraphs.append(paraphrased_text)
    return "\n\n".join(paraphrased_paragraphs)

# Main function to handle the overall process with batch processing
def main_function(AI_text):
    """Score *AI_text* sentence by sentence, then return a humanized rewrite.

    The text is split into sentences, each sentence is scored by
    detect_ai_generated, and the share of sentences scoring above 0.5 is
    reported as a percentage. The whole text is then paraphrased and passed
    through the random perturbation pipeline.

    Args:
        AI_text: The text entered by the user; may be empty or None.

    Returns:
        A report string with the percentage and the humanized text. Blank
        input yields a 0.00% report with no humanized text.
    """
    if not AI_text or not AI_text.strip():
        # No sentences to score: avoid the empty-batch / division-by-zero path.
        return "AI-Generated Content: 0.00%\n\nHumanized Text:\n"

    sentences = nltk.sent_tokenize(AI_text)
    ai_probabilities = detect_ai_generated(sentences)
    flagged = sum(prob > 0.5 for prob in ai_probabilities)
    ai_generated_percentage = flagged / len(ai_probabilities) * 100 if ai_probabilities else 0.0

    # Transform AI text to make it more human-like
    humanized_text = humanize_text(AI_text)
    humanized_text = transform_text(humanized_text)  # Add randomness to simulate human errors

    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"

# Gradio UI: one text box in, one text box out, backed by main_function.
interface = gr.Interface(
    main_function,
    "textbox",
    "textbox",
    title="AI Text Humanizer",
    description="Enter AI-generated text and get a human-written version. This space uses models from Hugging Face directly.",
)

# Start serving the app (debug mode enabled).
interface.launch(debug=True)