AI-Text-Detector

Running

File size: 4,961 Bytes

import os
import requests
import subprocess  # Import the subprocess module
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
#from nltk.tokenize import word_tokenize  # Assuming you've imported this for word_tokenize
import pickle
import numpy as np
import streamlit as st

# Custom headers for the HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}

# Debugging: Print current working directory initially
st.write(f"Initial Current Working Directory: {os.getcwd()}")

# Check if the model folder exists
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers)
        r.raise_for_status()

        # Debugging: Check if download is successful by examining content length
        st.write(f"Downloaded model size: {len(r.content)} bytes")

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        # Debugging: Verify that the zip file exists
        if os.path.exists(zip_file_path):
            st.write("Zip file exists")

            # Debugging: List contents of the zip file using unzip
            subprocess.run(['unzip', '-l', zip_file_path])

            # Extract the model using unzip
            unzip_result = subprocess.run(['unzip', '-o', zip_file_path, '-d', 'my_authorship_model'])

            # Debugging: Check unzip exit code (0 means success)
            if unzip_result.returncode == 0:
                st.write("Model folder successfully extracted using unzip")
                # Debugging: List the directory contents after extraction
                st.write("Listing directory contents:")
                st.write(os.listdir('.'))
                
            else:
                st.write("Model folder was not extracted successfully using unzip")
                exit(1)
        else:
            st.write("Zip file does not exist")
            exit(1)
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        exit(1)
else:
    st.write("Model folder exists")

# Debugging: Print current working directory after extraction
st.write(f"Current Working Directory After Extraction: {os.getcwd()}")


# Debugging: Check if model folder contains required files
try:
    model_files = os.listdir('my_authorship_model')
    st.write(f"Files in model folder: {model_files}")
except Exception as e:
    st.write(f"Could not list files in model folder: {e}")

# Download required files
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}

for filename, url in file_urls.items():
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        st.write(f"Failed to download {filename}: {e}")
        exit(1)

# Load the saved model
loaded_model = load_model("my_authorship_model")

# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

max_length = 300  # As defined in the training code

# Function to predict author for new text
def predict_author(new_text, model, tokenizer, label_encoder):
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence)
    
    predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
    
    probabilities = prediction[0]
    author_probabilities = {}
    for idx, prob in enumerate(probabilities):
        author = label_encoder.inverse_transform([idx])[0]
        author_probabilities[author] = prob
    
    return predicted_label, author_probabilities

st.markdown("CNN : version: 1.2")
new_text = st.text_area("Input your text here")
#words_counts = word_tokenize(new_text)  # Changed input_paragraph to new_text
#final_words = len(words_counts)
#st.write('Words counts: ', final_words)

predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

st.write(f"The text is most likely written by: {predicted_author}")
st.write("Probabilities for each author are (sorted):")
for author, prob in sorted_probabilities:
    st.write(f"{author}: {prob * 100:.2f}%")