import streamlit as st
import zipfile
import os
import requests
import re
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
from PIL import Image



# Custom headers for the HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
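
# A small download helper (a sketch, not wired into the flow below): it mirrors
# the repeated requests.get / raise_for_status / write-to-disk pattern used
# throughout this script. The 60-second timeout is an assumption, not part of
# the original app.
def download_file(url, dest_path, timeout=60):
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    with open(dest_path, "wb") as f:
        f.write(r.content)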



# Map internal class labels to human-readable author names
AUTHOR_MAP = {
    "googlebard": "Google Bard",
    "gpt3": "ChatGPT-3",
    "gpt4": "ChatGPT-4",
    "huggingface": "HuggingChat",
    "human": "Human-Written"
}


def get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction):
    cnn_predicted_author_display_name = AUTHOR_MAP.get(predicted_author, predicted_author)
    ridge_predicted_author_display_name = AUTHOR_MAP.get(ridge_prediction[0], ridge_prediction[0])
    extra_trees_predicted_author_display_name = AUTHOR_MAP.get(extra_trees_prediction[0], extra_trees_prediction[0])

    return cnn_predicted_author_display_name, ridge_predicted_author_display_name, extra_trees_predicted_author_display_name
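
# Example (a sketch; the argument values are illustrative):
#   get_author_display_name("gpt4", ["human"], ["googlebard"])
#   -> ("ChatGPT-4", "Human-Written", "Google Bard")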

############# Download or check files/folders existence ##############
# Check if the model folder exists
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model archive
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers)
        r.raise_for_status()

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        # Verify the zip file was written, then extract the model
        if os.path.exists(zip_file_path):
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall('my_authorship_model')
        else:
            st.write("Zip file does not exist")
            st.stop()
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        st.stop()
else:
    st.write("AI Text Detection")


# Download the required files
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}

for filename, url in file_urls.items():
    if not os.path.exists(filename):  # Download only if the file is missing
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            st.write(f"Failed to download {filename}: {e}")
            st.stop()
############ Download Ridge and Extra Trees artifacts ############


def is_zip_file(file_path):
    return zipfile.is_zipfile(file_path)

def are_files_extracted(extracted_files, missing_files):
    # True when every previously-missing file now appears in the listing
    return all(file in extracted_files for file in missing_files)
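
# Equivalently (a sketch): set(missing_files).issubset(extracted_files)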

def check_and_download_files():
    file_names = [
        "truncated_260_to_284.xlsx_vectorizer.pkl",
        "not_trancated_full_paragraph.xlsx_extra_trees_model.pkl",
        "not_trancated_full_paragraph.xlsx_ridge_model.pkl",
        "not_trancated_full_paragraph.xlsx_vectorizer.pkl",
        "truncated_10_to_34.xlsx_extra_trees_model.pkl",
        "truncated_10_to_34.xlsx_ridge_model.pkl",
        "truncated_10_to_34.xlsx_vectorizer.pkl",
        "truncated_35_to_59.xlsx_extra_trees_model.pkl",
        "truncated_35_to_59.xlsx_ridge_model.pkl",
        "truncated_35_to_59.xlsx_vectorizer.pkl",
        "truncated_60_to_84.xlsx_extra_trees_model.pkl",
        "truncated_60_to_84.xlsx_ridge_model.pkl",
        "truncated_60_to_84.xlsx_vectorizer.pkl",
        "truncated_85_to_109.xlsx_extra_trees_model.pkl",
        "truncated_85_to_109.xlsx_ridge_model.pkl",
        "truncated_85_to_109.xlsx_vectorizer.pkl",
        "truncated_110_to_134.xlsx_extra_trees_model.pkl",
        "truncated_110_to_134.xlsx_ridge_model.pkl",
        "truncated_110_to_134.xlsx_vectorizer.pkl",
        "truncated_135_to_159.xlsx_extra_trees_model.pkl",
        "truncated_135_to_159.xlsx_ridge_model.pkl",
        "truncated_135_to_159.xlsx_vectorizer.pkl",
        "truncated_160_to_184.xlsx_extra_trees_model.pkl",
        "truncated_160_to_184.xlsx_ridge_model.pkl",
        "truncated_160_to_184.xlsx_vectorizer.pkl",
        "truncated_185_to_209.xlsx_extra_trees_model.pkl",
        "truncated_185_to_209.xlsx_ridge_model.pkl",
        "truncated_185_to_209.xlsx_vectorizer.pkl",
        "truncated_210_to_234.xlsx_extra_trees_model.pkl",
        "truncated_210_to_234.xlsx_ridge_model.pkl",
        "truncated_210_to_234.xlsx_vectorizer.pkl",
        "truncated_235_to_259.xlsx_extra_trees_model.pkl",
        "truncated_235_to_259.xlsx_ridge_model.pkl",
        "truncated_235_to_259.xlsx_vectorizer.pkl",
        "truncated_260_to_284.xlsx_extra_trees_model.pkl",
        "truncated_260_to_284.xlsx_ridge_model.pkl"
    ]
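    # Note: apart from the not_trancated_* entries, these names follow regular
    # 25-word buckets and could be generated instead (a sketch):
    #   [f"truncated_{s}_to_{s + 24}.xlsx_{kind}.pkl"
    #    for s in range(10, 285, 25)
    #    for kind in ("extra_trees_model", "ridge_model", "vectorizer")]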
    missing_files = []

    for file_name in file_names:
        if not os.path.exists(file_name):
            missing_files.append(file_name)

    if missing_files:
        st.write("Some files are missing")

        try:
            url = 'https://jaifar.net/ADS/content.zip'
            response = requests.get(url, headers=headers)
            response.raise_for_status()

            with open('content.zip', 'wb') as zip_file:
                zip_file.write(response.content)

            if not is_zip_file('content.zip'):
                st.write("Downloaded content is not a ZIP file.")
                return

            with zipfile.ZipFile('content.zip', 'r') as zip_ref:
                zip_ref.extractall()

            extracted_files = os.listdir()
            if not are_files_extracted(extracted_files, missing_files):
                st.write("Not all missing files were extracted.")
                return
            
            st.write("content.zip downloaded and extracted successfully.")
        except Exception as e:
            st.write(f"Error downloading or extracting content.zip: {e}")

check_and_download_files()

############### Load CNN Model ############
# Load the saved model
loaded_model = load_model("my_authorship_model")
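
# Note: in Streamlit, heavyweight loads like this can be cached across reruns
# (a sketch; st.cache_resource is available in recent Streamlit releases):
#   @st.cache_resource
#   def get_model():
#       return load_model("my_authorship_model")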

# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

max_length = 300  # input sequences are padded/truncated to this length (must match the trained model)

############### End Load CNN Model ############

# Function to predict author for new text
def predict_author(new_text, model, tokenizer, label_encoder):
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence)

    predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
    probabilities = prediction[0]
    author_probabilities = {}
    for idx, prob in enumerate(probabilities):
        author = label_encoder.inverse_transform([idx])[0]
        author_probabilities[author] = prob

    return predicted_label, author_probabilities
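
# Example usage (a sketch; the sample text is illustrative):
#   label, probs = predict_author("Some paragraph of text...", loaded_model, tokenizer, label_encoder)
#   `label` is the most likely class; `probs` maps each class name to its softmax probability.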

new_text = st.text_area("Input Your Text Here:")

# Create the classification button
press_me_button = st.button("Human or Robot?")

if press_me_button:
    
    ########## ML 
    word_count = len(re.findall(r'\w+', new_text))
    st.write(f"Words Count: {word_count}")

    # Choose the appropriate model based on word count
    if 10 <= word_count <= 34:
        file_prefix = 'truncated_10_to_34.xlsx'
    elif 35 <= word_count <= 59:
        file_prefix = 'truncated_35_to_59.xlsx'
    elif 60 <= word_count <= 84:
        file_prefix = 'truncated_60_to_84.xlsx'
    elif 85 <= word_count <= 109:
        file_prefix = 'truncated_85_to_109.xlsx'
    elif 110 <= word_count <= 134:
        file_prefix = 'truncated_110_to_134.xlsx'
    elif 135 <= word_count <= 159:
        file_prefix = 'truncated_135_to_159.xlsx'
    elif 160 <= word_count <= 184:
        file_prefix = 'truncated_160_to_184.xlsx'
    elif 185 <= word_count <= 209:
        file_prefix = 'truncated_185_to_209.xlsx'
    elif 210 <= word_count <= 234:
        file_prefix = 'truncated_210_to_234.xlsx'
    elif 235 <= word_count <= 259:
        file_prefix = 'truncated_235_to_259.xlsx'
    elif 260 <= word_count <= 284:
        file_prefix = 'truncated_260_to_284.xlsx'
    else:
        file_prefix = 'not_trancated_full_paragraph.xlsx'
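
    # The chain above is equivalent to computing the 25-word bucket directly
    # (a sketch): for 10 <= word_count <= 284,
    #   start = 10 + 25 * ((word_count - 10) // 25)
    #   file_prefix = f"truncated_{start}_to_{start + 24}.xlsx"
    # with the not_trancated_* prefix covering all other lengths.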
    
    # Load the models and vectorizer
    
    with open(f"{file_prefix}_ridge_model.pkl", 'rb') as file:
        ridge_model = pickle.load(file)
    
    with open(f"{file_prefix}_extra_trees_model.pkl", 'rb') as file:
        extra_trees_model = pickle.load(file)
        
    with open(f"{file_prefix}_vectorizer.pkl", 'rb') as file:
        vectorizer = pickle.load(file)

    # ML Vectorizing the input
    user_input_transformed = vectorizer.transform([new_text])

    # ML predictions
    ridge_prediction = ridge_model.predict(user_input_transformed)
    extra_trees_prediction = extra_trees_model.predict(user_input_transformed)
    
    # CNN prediction + Vectorizing the input
    predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
    sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)
    
    cnn_name, ridge_name, extra_trees_name = get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction)
    with st.expander("Prediction Details (Click Here)..."):
        st.write(f"Ridge: {ridge_name}")
        st.write(f"ExtraTree: {extra_trees_name}")
        st.write(f"CNN: {cnn_name}")
        st.write("_" * 10)
        st.write("CNN Prediction Probabilities:")
        for author, prob in sorted_probabilities:
            display_name = AUTHOR_MAP.get(author, author)
            st.write(f"{display_name}: {prob * 100:.2f}%")
            st.progress(float(prob))
            
    # Top CNN class and its probability (not used by the logic below)
    max_cnn_prob_name = sorted_probabilities[0][0]
    max_cnn_prob = float(sorted_probabilities[0][1])

    if word_count < 10 or word_count > 1081:
        st.warning("For better prediction, input text between 10 and 1081 words.", icon="ℹ️")

    elif word_count < 256: 
        if ridge_prediction[0] == extra_trees_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="βœ…")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")
            
        elif ridge_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="βœ…")
            st.success(f"2nd Most likely written by: **{extra_trees_name}**", icon="βœ…")
            st.write("_" * 30)

        elif extra_trees_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="βœ…")
            st.success(f"2nd Most likely written by: **{ridge_name}**", icon="βœ…")
            st.write("_" * 30)

        else:
            st.warning("Notice 1: There is a difficulity predicting your text, it might fill into one of the below:", icon="⚠️")
            st.success(f"1- **{cnn_name}**", icon="βœ…")
            st.success(f"2- **{ridge_name}**", icon="βœ…")
            st.success(f"3- **{extra_trees_name}**", icon="βœ…")
    
    else: 
        if ridge_prediction[0] == extra_trees_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="βœ…")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")

        elif ridge_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="βœ…")
            st.success(f"2nd Most likely written by: **{extra_trees_name}**", icon="βœ…")
            st.write("_" * 30)

        elif ridge_prediction[0] == extra_trees_prediction[0]:
            st.success(f"Most likely written by: **{ridge_name}**", icon="βœ…")
            st.success(f"2nd Most likely written by: **{cnn_name}**", icon="βœ…")
            st.write("_" * 30)

        else:
            st.warning("Notice 1: There is a difficulity predicting your text, it might fill into one of the below:", icon="⚠️")
            st.success(f"1- **{ridge_name}**", icon="βœ…")
            st.success(f"2- **{cnn_name}**", icon="βœ…")
            st.success(f"3- **{extra_trees_name}**", icon="βœ…")


 

# Using expander to make FAQ sections
st.subheader("More about the AI Text Detector Project:")

# Small Description
with st.expander("What is this project about?"):
    st.write("""
    This AI Text Detector tells whether a text is written by a Human or a specific Large Language Model (LLM) like ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
    Ridge, Extra trees and CNN are the machine learning algorithms have been used to create this AI Text Detector.
    
    """)
    

# System Details
with st.expander("How does the AI Text Detector work?"):
    st.write("""
    The system is trained using deep learning model on a dataset of 140,546 paragraphs, varying in length from 10 to 1090 words.
    It achieves an accuracy of 0.9964 with a validation loss of 0.094.
    """)

    # Fetch the accuracy chart from the URL
    accuracy_image_request = requests.get("https://jaifar.net/ADS/best_accuracy.png", headers=headers)
    accuracy_image_request.raise_for_status()

    # Save the downloaded content
    image_path = "best_accuracy.png"
    with open(image_path, "wb") as f:
        f.write(accuracy_image_request.content)

    
    # Open the image
    accuracy_image = Image.open(image_path)
    
    # Display the image using streamlit
    st.image(accuracy_image, caption='Best Accuracy', use_column_width=True)