# huamnifierWithSimpleGrammer
#
# Running
#
# File size: 5,721 Bytes

# Import dependencies
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import torch
import nltk
import random
import string

# Download NLTK data (if not already downloaded).
# Recent NLTK releases (3.9+) make nltk.sent_tokenize load the 'punkt_tab'
# resource instead of the pickled 'punkt' models, so both are fetched to keep
# sentence splitting working on old and new installs.
# NOTE(review): on an NLTK release that does not know 'punkt_tab' the download
# call is expected to report the failure and return False rather than raise —
# confirm against the pinned NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Classifier checkpoint (DistilBERT) whose scores detect_ai_generated reports.
# NOTE(review): the checkpoint name says it is fine-tuned on SST-2, a sentiment
# task, so its class-1 score may not measure "AI-generated" at all — confirm
# and swap in a real detector checkpoint if needed.
_DETECTOR_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(_DETECTOR_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_DETECTOR_CHECKPOINT)

# T5 paraphrasing checkpoint (SRDdev/Paraphrase) used by humanize_text.
_PARAPHRASE_CHECKPOINT = "SRDdev/Paraphrase"
paraphrase_tokenizer = T5Tokenizer.from_pretrained(_PARAPHRASE_CHECKPOINT)
paraphrase_model = T5ForConditionalGeneration.from_pretrained(_PARAPHRASE_CHECKPOINT)

# AI detection function using DistilBERT
def detect_ai_generated(text):
    """Score *text* with the module-level sequence classifier.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model`` with gradient tracking disabled, and the logits are turned into
    probabilities with a softmax over dimension 1.

    Args:
        text: The text to score.

    Returns:
        float: The probability at row 0, column 1 of the softmax output,
        which the rest of this file treats as the "AI-generated" probability.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    class_probs = torch.softmax(logits, dim=1)
    # Column 1 is the class this app interprets as "AI-generated".
    return class_probs[0][1].item()

# Random text transformations to simulate human-like errors
def random_capitalize(word):
    """Capitalize *word* roughly one time in ten.

    Only purely alphabetic words are eligible, and a random number is drawn
    only for those. Every other word, and the remaining ~90% of eligible
    words, is returned unchanged.
    """
    if not word.isalpha():
        return word
    should_capitalize = random.random() < 0.1
    return word.capitalize() if should_capitalize else word

def random_remove_punctuation(text):
    """Drop up to three randomly chosen punctuation characters from *text*.

    The removal happens with probability 0.2; otherwise, or when the text
    holds no ``string.punctuation`` characters, the text comes back unchanged.
    """
    if random.random() >= 0.2:
        return text
    punct_positions = [pos for pos, ch in enumerate(text) if ch in string.punctuation]
    if not punct_positions:
        return text
    # Pick distinct positions, then rebuild the string without them.
    dropped = set(random.sample(punct_positions, min(3, len(punct_positions))))
    return ''.join(ch for pos, ch in enumerate(text) if pos not in dropped)

def random_double_period(text):
    """Double the first three periods of *text* with probability 0.2.

    When the transformation does not fire, the text is returned unchanged.
    """
    roll = random.random()
    if roll >= 0.2:
        return text
    # str.replace with a count touches only the leftmost three '.' characters.
    return text.replace('.', '..', 3)

def random_double_space(text):
    """Occasionally widen a few gaps between words to a double space.

    With probability 0.2 the text is split on whitespace, up to three distinct
    gaps between adjacent words are picked at random, and the words are
    re-joined with two spaces in the picked gaps and one space elsewhere.
    Otherwise the text is returned untouched.

    Note: when the transformation fires, the split/join also collapses any
    pre-existing whitespace runs and strips leading/trailing whitespace.

    Args:
        text: The string to alter.

    Returns:
        The (possibly) altered string.
    """
    if random.random() >= 0.2:
        return text
    words = text.split()
    gap_count = len(words) - 1
    if gap_count < 1:
        # Zero or one word: there is no gap to widen.
        return ' '.join(words)
    # Sample distinct gaps so none is widened twice, and append ONE space:
    # the join below contributes the other, giving exactly two. (Appending
    # two spaces, as before, produced runs of three or more.)
    for idx in random.sample(range(gap_count), min(3, gap_count)):
        words[idx] += ' '
    return ' '.join(words)

def random_replace_comma_space(text, period_replace_percentage=0.33):
    """Move the space of some ", " and ". " pairs in front of the mark.

    About a third of the ", " occurrences and a third of the ". " occurrences
    (``count // 3``, but at least one when any exist) are chosen at random
    and rewritten as " ," and " ." respectively.

    Args:
        text: The string to alter.
        period_replace_percentage: Accepted but not read by this function;
            the share is fixed at one third by the integer division.

    Returns:
        The altered string, the same length as the input.
    """
    def positions_of(pair):
        # Every index at which `pair` starts, left to right.
        found = []
        cursor = text.find(pair)
        while cursor != -1:
            found.append(cursor)
            cursor = text.find(pair, cursor + 1)
        return found

    comma_positions = positions_of(", ")
    period_positions = positions_of(". ")

    comma_quota = max(1, text.count(", ") // 3)
    period_quota = max(1, text.count(". ") // 3)

    # Commas are sampled before periods, keeping the random draw order fixed.
    chosen = random.sample(comma_positions, min(comma_quota, len(comma_positions)))
    chosen += random.sample(period_positions, min(period_quota, len(period_positions)))

    # A chosen pair is always (mark, space) and two pairs can never overlap,
    # so swapping the two characters in place is the whole rewrite.
    chars = list(text)
    for pos in chosen:
        chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
    return ''.join(chars)

def transform_paragraph(paragraph):
    """Run the random "human error" transformations over one paragraph.

    Paragraphs of 12 or fewer whitespace-separated words are returned exactly
    as given. Longer ones are re-joined with single spaces after each word
    passes through random_capitalize, then go through the punctuation-removal,
    double-period, double-space and comma/period-spacing steps in that order.
    """
    tokens = paragraph.split()
    if len(tokens) <= 12:
        return paragraph

    result = ' '.join(random_capitalize(token) for token in tokens)
    pipeline = (
        random_remove_punctuation,
        random_double_period,
        random_double_space,
        random_replace_comma_space,
    )
    for step in pipeline:
        result = step(result)
    return result

def transform_text(text):
    """Apply transform_paragraph to each newline-separated line of *text*.

    The text is split on '\\n', every piece (including empty ones) is
    transformed on its own, and the pieces are joined back with '\\n'.
    """
    return '\n'.join(map(transform_paragraph, text.split('\n')))

# Humanize the AI-detected text using the SRDdev Paraphrase model
def humanize_text(AI_text):
    """Paraphrase each non-blank line of *AI_text* with the T5 model.

    The text is split on newlines; lines that are empty or whitespace-only
    are dropped. Each remaining line is tokenized (truncated to 512 tokens)
    and rewritten with beam search, and the rewritten lines are joined with
    a blank line between them.

    Args:
        AI_text: The text to paraphrase.

    Returns:
        The paraphrased lines joined by "\\n\\n" (an empty string when every
        line is blank).
    """
    rewritten = []
    for block in AI_text.split("\n"):
        if not block.strip():
            continue  # blank lines are skipped, not paraphrased
        encoded = paraphrase_tokenizer(block, return_tensors="pt", max_length=512, truncation=True)
        input_ids = encoded['input_ids']
        generated = paraphrase_model.generate(
            input_ids,
            # Allow the output to run slightly longer than the input.
            max_length=input_ids.shape[-1] + 20,
            num_beams=4,
            early_stopping=True,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
        )
        rewritten.append(paraphrase_tokenizer.decode(generated[0], skip_special_tokens=True))
    return "\n\n".join(rewritten)

# Main function to handle the overall process
def main_function(AI_text):
    """Score the input text, humanize it, and format both results.

    Each sentence from ``nltk.sent_tokenize`` is scored with
    detect_ai_generated; the reported percentage is the share of sentences
    scoring above 0.5. The whole text is then paraphrased with humanize_text
    and passed through transform_text to add random "human" errors.

    Empty or whitespace-only input yields a 0.00% report with no humanized
    text instead of dividing by a sentence count of zero.

    Args:
        AI_text: The text submitted through the interface.

    Returns:
        str: "AI-Generated Content: <pct>%" followed by the humanized text.
    """
    if not AI_text or not AI_text.strip():
        # Nothing to score or paraphrase.
        ai_generated_percentage, humanized_text = 0.0, ""
    else:
        sentences = nltk.sent_tokenize(AI_text)
        flagged = sum(detect_ai_generated(sentence) > 0.5 for sentence in sentences)
        # Guard the division in case the tokenizer finds no sentences.
        ai_generated_percentage = flagged / len(sentences) * 100 if sentences else 0.0

        # Transform AI text to make it more human-like
        humanized_text = humanize_text(AI_text)
        humanized_text = transform_text(humanized_text)  # Add randomness to simulate human errors

    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"

# Gradio interface definition: one text box in, one text box out, with
# main_function doing the work in between.
_APP_TITLE = "AI Text Humanizer"
_APP_DESCRIPTION = (
    "Enter AI-generated text and get a human-written version. "
    "This space uses models from Hugging Face directly."
)
interface = gr.Interface(
    fn=main_function,
    inputs="textbox",
    outputs="textbox",
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)

# Launch the Gradio app
interface.launch(debug=True)