File size: 5,721 Bytes
29edf23
84669bc
29edf23
 
 
71bcf84
 
936bfca
29edf23
 
 
936bfca
29edf23
 
 
936bfca
29edf23
 
 
35244e7
29edf23
 
 
 
 
 
 
 
10dc1f6
29edf23
10dc1f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29edf23
 
 
 
 
 
 
 
 
 
 
 
 
 
10dc1f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29edf23
84669bc
29edf23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ada2d1a
29edf23
 
 
 
10dc1f6
29edf23
 
 
99b3c08
84669bc
 
29edf23
 
 
 
 
84669bc
776fa07
84669bc
29edf23
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Import dependencies
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import torch
import nltk
import random
import string

# Download NLTK data (if not already downloaded).
# 'punkt' is the sentence-tokenizer resource used by older NLTK releases;
# recent releases look up 'punkt_tab' instead, so both are requested to keep
# nltk.sent_tokenize working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Hugging Face checkpoint identifiers used by this app.
_DETECTOR_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
_PARAPHRASE_CHECKPOINT = "SRDdev/Paraphrase"

# Load the sequence-classification model (DistilBERT) and its tokenizer; the
# class-1 probability of this model is what detect_ai_generated() reports.
# NOTE(review): the checkpoint name indicates an SST-2 (sentiment) fine-tune --
# confirm it is the intended model for AI-text detection.
tokenizer = AutoTokenizer.from_pretrained(_DETECTOR_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_DETECTOR_CHECKPOINT)

# Load the T5 paraphrase model and tokenizer used by humanize_text().
paraphrase_tokenizer = T5Tokenizer.from_pretrained(_PARAPHRASE_CHECKPOINT)
paraphrase_model = T5ForConditionalGeneration.from_pretrained(_PARAPHRASE_CHECKPOINT)

# AI detection function using DistilBERT
def detect_ai_generated(text):
    """Score *text* with the module-level classifier.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model`` with gradient tracking disabled, and the logits are
    softmax-normalised along dim 1.

    Returns:
        The probability at class index 1 of the first row, as a Python
        float. This file treats that value as the probability of the text
        being AI-generated.
    """
    encoded = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits
    class_probs = torch.softmax(logits, dim=1)
    return class_probs[0][1].item()

# Random text transformations to simulate human-like errors
def random_capitalize(word):
    """Capitalize a purely alphabetic word with probability 0.1.

    Words containing any non-alphabetic character (and the empty string) are
    returned untouched and do not consume a random draw.
    """
    if not word.isalpha():
        return word
    return word.capitalize() if random.random() < 0.1 else word

def random_remove_punctuation(text):
    """Occasionally drop a few punctuation characters from *text*.

    With probability 0.2, up to three randomly chosen characters that belong
    to ``string.punctuation`` are removed; otherwise (or when the text has no
    punctuation) the text is returned unchanged.
    """
    if random.random() >= 0.2:
        return text
    punct_positions = [pos for pos, ch in enumerate(text) if ch in string.punctuation]
    if not punct_positions:
        return text
    doomed = set(random.sample(punct_positions, min(3, len(punct_positions))))
    return ''.join(ch for pos, ch in enumerate(text) if pos not in doomed)

def random_double_period(text):
    """With probability 0.2, turn the first three '.' in *text* into '..'.

    Returns the text unchanged the rest of the time.
    """
    if random.random() >= 0.2:
        return text
    return text.replace('.', '..', 3)

def random_double_space(text):
    """With probability 0.2, widen up to three word gaps to a double space.

    When triggered, the text is split on whitespace and re-joined with single
    spaces, except for up to three distinct, randomly chosen gaps between
    consecutive words, which become exactly two spaces. Previously the
    function appended two spaces and then joined with one more, producing
    triple (or wider, when the same index was drawn twice) gaps.

    Args:
        text: The text to modify.

    Returns:
        The original text when not triggered; otherwise the whitespace-
        normalised text with the selected gaps doubled. Input with fewer
        than two words has no gap to widen.
    """
    if random.random() >= 0.2:
        return text
    words = text.split()
    gap_count = len(words) - 1
    if gap_count < 1:
        # Nothing to widen; keep the triggered-path normalisation.
        return ' '.join(words)
    # Sampling without replacement guarantees distinct gaps.
    widened = set(random.sample(range(gap_count), min(3, gap_count)))
    pieces = []
    for idx, word in enumerate(words[:-1]):
        pieces.append(word)
        pieces.append('  ' if idx in widened else ' ')
    pieces.append(words[-1])
    return ''.join(pieces)

def random_replace_comma_space(text, period_replace_percentage=0.33):
    """Move the space in some ", " and ". " pairs in front of the mark.

    For each of the two markers, roughly one third of the occurrences (at
    least one when any exist) are chosen at random and rewritten as " ," or
    " ." respectively.

    Args:
        text: The text to modify.
        period_replace_percentage: Accepted but not read by the current
            implementation; the share replaced is fixed at one third.

    Returns:
        The text with the selected pairs swapped.
    """
    def _pick(marker):
        # Positions where the two-character marker starts, then a random subset.
        positions = [pos for pos in range(len(text)) if text.startswith(marker, pos)]
        quota = max(1, len(positions) // 3)
        return random.sample(positions, min(quota, len(positions)))

    chosen = _pick(", ") + _pick(". ")
    chars = list(text)
    for pos in chosen:
        # Matches never overlap, so swapping mark and space in place is
        # order-independent.
        chars[pos], chars[pos + 1] = " ", chars[pos]
    return "".join(chars)

def transform_paragraph(paragraph):
    """Apply the random "human error" transformations to one paragraph.

    Paragraphs of 12 words or fewer are returned exactly as given. Longer
    ones are split on whitespace, each word is passed through
    random_capitalize, the words are re-joined with single spaces, and the
    result is run through the punctuation, double-period, double-space and
    comma/period-spacing transformations in that order.
    """
    tokens = paragraph.split()
    if len(tokens) <= 12:
        return paragraph

    result = ' '.join([random_capitalize(token) for token in tokens])
    pipeline = (
        random_remove_punctuation,
        random_double_period,
        random_double_space,
        random_replace_comma_space,
    )
    for mutate in pipeline:
        result = mutate(result)
    return result

def transform_text(text):
    """Run transform_paragraph over every newline-separated line of *text*.

    The lines are re-joined with single newlines, so the line structure of
    the input is preserved.
    """
    return '\n'.join(map(transform_paragraph, text.split('\n')))

# Humanize the AI-detected text using the SRDdev Paraphrase model
def humanize_text(AI_text):
    """Paraphrase each non-blank line of *AI_text* with the paraphrase model.

    The input is split on newlines; blank or whitespace-only lines are
    skipped. Each remaining line is tokenized (truncated to 512 tokens) and
    rewritten with beam search, and the decoded results are joined with
    blank lines ("\\n\\n").

    Returns:
        The paraphrased paragraphs as one string; an empty string when the
        input has no non-blank lines.
    """
    rewritten = []
    for block in AI_text.split("\n"):
        if not block.strip():
            continue
        encoded = paraphrase_tokenizer(block, return_tensors="pt", max_length=512, truncation=True)
        source_ids = encoded['input_ids']
        generated = paraphrase_model.generate(
            source_ids,
            # Allow the output to run slightly longer than the input.
            max_length=source_ids.shape[-1] + 20,
            num_beams=4,
            early_stopping=True,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
        )
        rewritten.append(paraphrase_tokenizer.decode(generated[0], skip_special_tokens=True))
    return "\n\n".join(rewritten)

# Main function to handle the overall process
def main_function(AI_text):
    """Score the input and return a report with a "humanized" rewrite.

    Each sentence found by ``nltk.sent_tokenize`` is scored with
    detect_ai_generated; the percentage of sentences scoring above 0.5 is
    reported. The whole input is then paraphrased with humanize_text and
    passed through transform_text to add random, human-like imperfections.

    Args:
        AI_text: The text entered by the user.

    Returns:
        A string containing the percentage (two decimals) followed by the
        humanized text.
    """
    sentences = nltk.sent_tokenize(AI_text)
    flagged = sum(detect_ai_generated(sentence) > 0.5 for sentence in sentences)
    # If tokenization yields no sentences (e.g. an empty textbox), report 0%
    # instead of raising ZeroDivisionError.
    ai_generated_percentage = flagged / len(sentences) * 100 if sentences else 0.0

    # Paraphrase first, then add randomness to simulate human errors.
    humanized_text = transform_text(humanize_text(AI_text))

    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"

# Gradio UI: one input textbox and one output textbox wired to main_function.
interface = gr.Interface(
    main_function,
    "textbox",
    "textbox",
    title="AI Text Humanizer",
    description="Enter AI-generated text and get a human-written version. This space uses models from Hugging Face directly.",
)

# Start the Gradio app (debug mode enabled).
interface.launch(debug=True)