File size: 4,397 Bytes
29edf23
84669bc
29edf23
 
 
23a08cd
30196dc
 
936bfca
30196dc
 
 
 
936bfca
7c9a059
 
 
 
 
 
23a08cd
c93f011
 
 
29edf23
 
c93f011
936bfca
29edf23
 
c93f011
35244e7
30196dc
 
 
 
 
 
 
23a08cd
30196dc
 
23a08cd
30196dc
23a08cd
30196dc
 
 
 
 
 
23a08cd
30196dc
 
10dc1f6
30196dc
 
 
 
 
b3aee5e
 
ea28e08
30196dc
84669bc
29edf23
 
 
 
c93f011
b3aee5e
 
 
 
 
 
 
 
 
29edf23
 
 
 
30196dc
ada2d1a
30196dc
 
 
 
 
29edf23
30196dc
 
29edf23
30196dc
99b3c08
84669bc
 
29edf23
 
 
30196dc
 
84669bc
776fa07
84669bc
b3aee5e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Import dependencies
import subprocess
import sys

import gradio as gr
import nltk
import spacy
import torch
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration

# Download NLTK data (if not already downloaded)
# Fetch each NLTK resource in turn: the punkt tokenizer data, the stop-word
# lists, and the WordNet corpus used by the synonym lookup.
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource)

# Download spaCy model if not already installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package is not installed. Download it with the interpreter
    # that is running this script (a bare "python" may resolve to a different
    # environment or not exist on PATH), and fail loudly if the download
    # itself fails instead of hiding it behind a second OSError below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Check for GPU and set the device accordingly
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Load AI Detector model and tokenizer from Hugging Face (DistilBERT).
# NOTE(review): the checkpoint id names an SST-2 fine-tune; confirm it is the
# intended detector checkpoint.
_DETECTOR_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(_DETECTOR_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_DETECTOR_CHECKPOINT)
model = model.to(device)

# Load SRDdev Paraphrase model and tokenizer for humanizing text
_PARAPHRASE_CHECKPOINT = "SRDdev/Paraphrase"
paraphrase_tokenizer = T5Tokenizer.from_pretrained(_PARAPHRASE_CHECKPOINT)
paraphrase_model = T5ForConditionalGeneration.from_pretrained(_PARAPHRASE_CHECKPOINT)
paraphrase_model = paraphrase_model.to(device)

# Function to find synonyms using WordNet via NLTK
def get_synonyms(word):
    """Return WordNet synonyms for ``word`` in a stable order.

    Walks every synset returned by ``wordnet.synsets(word)`` and collects the
    lemma names, keeping only the first occurrence of each.

    Args:
        word: The word to look up.

    Returns:
        A list of unique synonym strings in the order WordNet yields them.
        Multi-word lemmas use spaces instead of WordNet's underscores, and
        entries equal to ``word`` itself (ignoring case) are left out. The
        list is empty when ``word`` is empty or nothing else is found.
    """
    if not word:
        return []

    target = word.lower()
    # A dict preserves insertion order, so a caller that takes the first entry
    # gets the same synonym on every run. A set of str iterates in an order
    # that changes with hash randomisation.
    ordered = {}
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ")
            if candidate.lower() != target:
                ordered.setdefault(candidate, None)
    return list(ordered)

# Replace words with synonyms using spaCy and WordNet
def replace_with_synonyms(text):
    """Swap nouns, verbs, adjectives and adverbs for a WordNet synonym.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Each
    token tagged NOUN, VERB, ADJ or ADV is replaced by the first entry from
    ``get_synonyms`` that actually differs from the token; all other tokens
    are kept verbatim.

    Args:
        text: The input text.

    Returns:
        The text with replacements applied. Original spacing, punctuation
        attachment and line breaks are preserved. An empty input yields an
        empty string.
    """
    if not text:
        return ""

    replaceable_pos = {"NOUN", "VERB", "ADJ", "ADV"}
    pieces = []
    for token in nlp(text):
        piece = token.text
        # Check the POS tag first so WordNet is only queried for tokens that
        # can be replaced at all.
        if token.pos_ in replaceable_pos:
            original = token.text.lower()
            for candidate in get_synonyms(original):
                # WordNet joins multi-word lemmas with underscores.
                candidate = candidate.replace("_", " ")
                # Skip the word itself; it is not a replacement.
                if candidate.lower() != original:
                    piece = candidate.capitalize() if token.is_title else candidate
                    break
        # Re-attach the token's own trailing whitespace instead of joining
        # with a blanket " ", which would put spaces before punctuation.
        pieces.append(piece + token.whitespace_)
    return "".join(pieces)

# AI detection function using DistilBERT
def detect_ai_generated(text):
    """Score ``text`` with the module-level sequence classifier.

    The text is tokenized (truncated to at most 512 tokens), moved to the
    module-level ``device`` and passed through ``model`` with gradient
    tracking disabled.

    Args:
        text: The text to classify.

    Returns:
        The softmax probability of class index 1 for the single input, as a
        Python float in the range 0-1.
        NOTE(review): what index 1 means depends on the checkpoint loaded into
        ``model``; confirm it corresponds to "AI-generated".
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = logits.softmax(dim=1)
    ai_score = class_probs[0][1]
    return ai_score.item()  # Probability of being AI-generated

# Humanize the AI-detected text using the SRDdev Paraphrase model
def humanize_text(AI_text):
    """Paraphrase ``AI_text`` line by line with the module-level T5 model.

    The input is split on newline characters. Every line that is not blank is
    tokenized (truncated to 512 tokens), run through
    ``paraphrase_model.generate`` with 4-beam search, and decoded without
    special tokens.

    Args:
        AI_text: The text to paraphrase.

    Returns:
        The paraphrased lines joined by a blank line (two newlines). Blank or
        whitespace-only lines are dropped, so an input with no content
        returns an empty string.
    """
    rewritten = []
    for block in AI_text.split("\n"):
        # Whitespace-only lines contribute nothing to the output.
        if not block.strip():
            continue

        encoded = paraphrase_tokenizer(block, return_tensors="pt", max_length=512, truncation=True).to(device)
        input_ids = encoded['input_ids']
        # Let the output run slightly longer than the original input.
        length_cap = input_ids.shape[-1] + 20

        with torch.no_grad():  # Inference only, no gradients needed
            generated = paraphrase_model.generate(
                input_ids,
                max_length=length_cap,
                num_beams=4,
                early_stopping=True,
                length_penalty=1.0,
                no_repeat_ngram_size=3,
            )

        decoded = paraphrase_tokenizer.decode(generated[0], skip_special_tokens=True)
        rewritten.append(decoded)

    return "\n\n".join(rewritten)

# Main function to handle the overall process
def main_function(AI_text):
    """Run the full pipeline on ``AI_text`` and format the result for display.

    Steps, in order: synonym replacement, classifier scoring of the
    synonym-replaced text, then paraphrasing of that same text.

    Args:
        AI_text: The text entered by the user.

    Returns:
        A string holding the classifier score as a percentage, followed by
        the paraphrased text.
    """
    # Replace words with synonyms
    text_with_synonyms = replace_with_synonyms(AI_text)

    # Detect AI-generated content
    ai_probability = detect_ai_generated(text_with_synonyms)

    # Humanize AI text
    humanized_text = humanize_text(text_with_synonyms)

    # ai_probability is a 0-1 softmax value. The "%" presentation type
    # multiplies by 100 and appends the sign, so 0.95 renders as "95.00%"
    # rather than the misleading "0.95%".
    return f"AI-Generated Content: {ai_probability:.2%}\n\nHumanized Text:\n{humanized_text}"

# Gradio interface definition
# Texts shown at the top of the web page
_APP_TITLE = "AI Text Humanizer with Synonym Replacement"
_APP_DESCRIPTION = "Enter AI-generated text and get a human-written version, with synonyms replaced for more natural output. This space uses models from Hugging Face directly."

# One text box in, one text box out, wired to the full pipeline
interface = gr.Interface(
    fn=main_function,
    inputs="textbox",
    outputs="textbox",
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)

# Start the Gradio server with debug mode off for production
interface.launch(debug=False)