Spaces:
Running
Running
File size: 5,721 Bytes
29edf23 84669bc 29edf23 71bcf84 936bfca 29edf23 936bfca 29edf23 936bfca 29edf23 35244e7 29edf23 10dc1f6 29edf23 10dc1f6 29edf23 10dc1f6 29edf23 84669bc 29edf23 ada2d1a 29edf23 10dc1f6 29edf23 99b3c08 84669bc 29edf23 84669bc 776fa07 84669bc 29edf23 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# Import dependencies
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import torch
import nltk
import random
import string
# Fetch the NLTK resources this script asks for (per the original note,
# intended to be a no-op when they are already downloaded).
for _resource in ("punkt", "stopwords"):
    nltk.download(_resource)

# Sequence classifier and tokenizer used by detect_ai_generated.
# NOTE(review): the checkpoint name reads as an SST-2 (sentiment) fine-tune of
# DistilBERT rather than an AI-text detector -- confirm this is the intended model.
_DETECTOR_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(_DETECTOR_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_DETECTOR_CHECKPOINT)

# T5 paraphrase model and tokenizer used by humanize_text.
_PARAPHRASE_CHECKPOINT = "SRDdev/Paraphrase"
paraphrase_tokenizer = T5Tokenizer.from_pretrained(_PARAPHRASE_CHECKPOINT)
paraphrase_model = T5ForConditionalGeneration.from_pretrained(_PARAPHRASE_CHECKPOINT)
# AI detection function using DistilBERT
def detect_ai_generated(text):
    """Score *text* with the module-level sequence classifier.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model`` with gradient tracking disabled, and the logits are turned into
    probabilities with a softmax over the class dimension.

    Args:
        text: The string to score.

    Returns:
        float: The probability assigned to class index 1 for the single
        sequence in the batch. Callers in this file treat this value as the
        probability that the text is AI-generated.
    """
    # NOTE(review): whether class index 1 really means "AI-generated" depends
    # on the checkpoint loaded into ``model`` -- confirm against its label map.
    encoded = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits
    class_probs = logits.softmax(dim=1)
    return class_probs[0, 1].item()
# Random text transformations to simulate human-like errors
def random_capitalize(word):
    """Capitalize a purely alphabetic word roughly one time in ten.

    Args:
        word: A single token.

    Returns:
        ``word.capitalize()`` (first character upper-cased, the rest
        lower-cased) with probability 0.1 when ``word.isalpha()`` is true;
        otherwise ``word`` unchanged.
    """
    # ``and`` short-circuits: non-alphabetic tokens never consume a random draw.
    should_capitalize = word.isalpha() and random.random() < 0.1
    return word.capitalize() if should_capitalize else word
def random_remove_punctuation(text):
    """Occasionally drop up to three punctuation characters from *text*.

    With probability 0.2, up to three positions holding a character from
    ``string.punctuation`` are chosen at random and removed. Otherwise the
    text is returned untouched.

    Args:
        text: The string to perturb.

    Returns:
        str: The (possibly shortened) text.
    """
    if random.random() >= 0.2:
        return text

    punct_positions = [pos for pos, char in enumerate(text) if char in string.punctuation]
    if not punct_positions:
        return text

    doomed = set(random.sample(punct_positions, min(3, len(punct_positions))))
    return ''.join(char for pos, char in enumerate(text) if pos not in doomed)
def random_double_period(text):
    """Occasionally double the first few periods in *text*.

    Args:
        text: The string to perturb.

    Returns:
        str: With probability 0.2, *text* with its first three ``.``
        characters each replaced by ``..``; otherwise *text* unchanged.
    """
    roll = random.random()
    return text.replace('.', '..', 3) if roll < 0.2 else text
def random_double_space(text):
    """Occasionally widen a few of the gaps between words.

    With probability 0.2 the text is split on whitespace, a trailing space is
    appended to up to three randomly chosen words (never the last one; the
    same word may be picked more than once), and the words are re-joined with
    single spaces. Note that on this path any original whitespace runs are
    normalised by the split/join round trip.

    Args:
        text: The string to perturb.

    Returns:
        str: The perturbed text, or *text* unchanged when the roll fails.
    """
    if random.random() >= 0.2:
        return text

    tokens = text.split()
    extra_spaces = min(3, len(tokens) - 1)  # <= 0 for zero or one word: loop is skipped
    for _ in range(extra_spaces):
        tokens[random.randint(0, len(tokens) - 2)] += ' '
    return ' '.join(tokens)
def random_replace_comma_space(text, period_replace_percentage=0.33):
    """Move the space in some ``", "`` and ``". "`` pairs in front of the mark.

    For each of the two markers, one third of its occurrences (integer
    division, but at least one when any exist) is chosen at random and
    rewritten: ``", "`` becomes ``" ,"`` and ``". "`` becomes ``" ."``.
    Unlike the other perturbations in this file, this one is applied on
    every call.

    Args:
        text: The string to perturb.
        period_replace_percentage: Accepted for interface compatibility; the
            body does not read it (the one-third ratio is fixed).

    Returns:
        str: The perturbed text, same length as the input.
    """
    def _positions(marker):
        # Start offset of every occurrence of the two-character marker.
        hits = []
        at = text.find(marker)
        while at != -1:
            hits.append(at)
            at = text.find(marker, at + 1)
        return hits

    def _pick(candidates):
        # A third of the candidates (minimum one), capped at what exists.
        quota = max(1, len(candidates) // 3)
        return random.sample(candidates, min(quota, len(candidates)))

    comma_hits = _positions(", ")
    period_hits = _positions(". ")
    # Commas are sampled before periods, fixing the order of RNG draws.
    chosen = _pick(comma_hits)
    chosen += _pick(period_hits)

    # Each rewrite is just a swap of the mark and the following space. Chosen
    # spans can never overlap (the second char of one is a space, the first
    # char of another is a mark), so the swaps are independent of each other.
    chars = list(text)
    for at in chosen:
        chars[at], chars[at + 1] = chars[at + 1], chars[at]
    return ''.join(chars)
def transform_paragraph(paragraph):
    """Apply the random "human error" perturbations to one paragraph.

    Paragraphs of twelve words or fewer are returned untouched. Longer ones
    are split on whitespace, each word is passed through
    ``random_capitalize``, the words are re-joined with single spaces, and the
    result is piped through the punctuation-removal, double-period,
    double-space and comma/period-spacing perturbations, in that order.

    Args:
        paragraph: One paragraph of text (no newlines expected by callers).

    Returns:
        str: The perturbed paragraph, or *paragraph* itself when it is short.
    """
    tokens = paragraph.split()
    if len(tokens) <= 12:
        return paragraph

    result = ' '.join(random_capitalize(token) for token in tokens)
    pipeline = (
        random_remove_punctuation,
        random_double_period,
        random_double_space,
        random_replace_comma_space,
    )
    for perturb in pipeline:
        result = perturb(result)
    return result
def transform_text(text):
    """Run ``transform_paragraph`` over every newline-separated paragraph.

    Args:
        text: Multi-paragraph text; paragraphs are delimited by ``\\n``.

    Returns:
        str: The transformed paragraphs joined back together with ``\\n``,
        so empty lines in the input are preserved as empty lines.
    """
    return '\n'.join(map(transform_paragraph, text.split('\n')))
# Humanize the AI-detected text using the SRDdev Paraphrase model
def humanize_text(AI_text):
    """Paraphrase each non-blank line of *AI_text* with the T5 paraphraser.

    The input is split on ``\\n``; lines that are empty or whitespace-only are
    skipped. Every remaining line is tokenized (truncated to 512 tokens) and
    rewritten with beam search via the module-level ``paraphrase_model``.

    Args:
        AI_text: The text to paraphrase.

    Returns:
        str: The paraphrased lines joined with a blank line (``\\n\\n``)
        between them; an empty string when there is nothing to paraphrase.
    """
    rewritten = []
    for block in AI_text.split("\n"):
        if not block.strip():
            continue

        encoded = paraphrase_tokenizer(block, return_tensors="pt", max_length=512, truncation=True)
        source_ids = encoded['input_ids']
        generated = paraphrase_model.generate(
            source_ids,
            # Allow the output to run slightly longer than the input.
            max_length=source_ids.shape[-1] + 20,
            num_beams=4,
            early_stopping=True,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
        )
        rewritten.append(paraphrase_tokenizer.decode(generated[0], skip_special_tokens=True))
    return "\n\n".join(rewritten)
# Main function to handle the overall process
def main_function(AI_text):
    """Score the input sentence by sentence, then return a humanized rewrite.

    Each sentence found by ``nltk.sent_tokenize`` is scored with
    ``detect_ai_generated``; the reported percentage is the share of sentences
    whose score exceeds 0.5. The full text is then paraphrased with
    ``humanize_text`` and perturbed with ``transform_text``.

    Args:
        AI_text: Raw text from the UI textbox. ``None`` is treated as an
            empty string.

    Returns:
        str: A report of the form
        ``"AI-Generated Content: <pct>%\\n\\nHumanized Text:\\n<text>"``.
        For empty or whitespace-only input the percentage is ``0.00`` and
        the humanized text is empty.
    """
    AI_text = AI_text or ""
    sentences = nltk.sent_tokenize(AI_text)
    flagged = sum(detect_ai_generated(sentence) > 0.5 for sentence in sentences)
    # No sentences (empty / whitespace-only input) would otherwise divide by
    # zero and surface as an error in the UI; report 0% instead.
    ai_generated_percentage = flagged / len(sentences) * 100 if sentences else 0.0

    # Paraphrase first, then add random "human-like" imperfections.
    humanized_text = transform_text(humanize_text(AI_text))
    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"
# Gradio UI: one textbox in, one textbox out, backed by main_function.
interface = gr.Interface(
    title="AI Text Humanizer",
    description="Enter AI-generated text and get a human-written version. This space uses models from Hugging Face directly.",
    fn=main_function,
    inputs="textbox",
    outputs="textbox",
)

# Start the app as soon as the module runs; debug=True is forwarded to launch().
interface.launch(debug=True)
|