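"""Streamlit app that estimates whether a text was written by a human or by a
specific LLM (ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat), combining a
CNN classifier with Ridge and Extra Trees models chosen by input word count."""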
import streamlit as st
import zipfile
import os
import requests
import re
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
from PIL import Image
from joblib import load
import math
# Browser-like request headers: some hosts reject the default python-requests User-Agent
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
#################### Load the banner image ##########
# Fetch the image from the URL
banner_image_request = requests.get("https://jaifar.net/ADS/banner.jpg", headers=headers)
# Save the downloaded content
banner_image_path = "banner.jpg"
with open(banner_image_path, "wb") as f:
f.write(banner_image_request.content)
# Open the image
banner_image = Image.open(banner_image_path)
# Display the image using streamlit
st.image(banner_image, caption='', use_column_width=True)
################ end loading banner image ##################
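# Note: the image fetches in this script are unguarded; if the host is
# unreachable, requests will raise and the run will fail. A more defensive
# pattern (a sketch, not wired in here) would be:
#   try:
#       r = requests.get("https://jaifar.net/ADS/banner.jpg", headers=headers, timeout=10)
#       r.raise_for_status()
#   except requests.RequestException:
#       st.warning("Could not load the banner image.")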
def get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction):
author_map = {
"googlebard": "Google Bard",
"gpt3": "ChatGPT-3",
"gpt4": "ChatGPT-4",
"huggingface": "HuggingChat",
"human": "Human-Written"
}
cnn_predicted_author_display_name = author_map.get(predicted_author, predicted_author)
ridge_predicted_author_display_name = author_map.get(ridge_prediction[0], ridge_prediction[0])
extra_trees_predicted_author_display_name = author_map.get(extra_trees_prediction[0], extra_trees_prediction[0])
return cnn_predicted_author_display_name, ridge_predicted_author_display_name, extra_trees_predicted_author_display_name
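# Illustrative usage (hypothetical inputs): the ML predictions arrive as
# 1-element arrays, the CNN prediction as a plain string:
#   get_author_display_name("gpt4", np.array(["human"]), np.array(["googlebard"]))
#   -> ("ChatGPT-4", "Human-Written", "Google Bard")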
############# Download or check files/folders existence ##############
# Check if the model folder exists
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
try:
# Download the model
model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
r = requests.get(model_url, headers=headers)
r.raise_for_status()
# Debugging: Check if download is successful by examining content length
# st.write(f"Downloaded model size: {len(r.content)} bytes")
# Save the downloaded content
with open(zip_file_path, "wb") as f:
f.write(r.content)
# Debugging: Verify that the zip file exists
if os.path.exists(zip_file_path):
# st.write("Zip file exists")
# Extract the model using zipfile
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
zip_ref.extractall('my_authorship_model')
# # Debugging: Check if the folder is successfully created
# if os.path.exists('my_authorship_model'):
# # st.write("Model folder successfully extracted using zipfile")
# # Debugging: List the directory contents after extraction
# # st.write("Listing directory contents:")
# # st.write(os.listdir('.'))
# else:
# st.write("Model folder was not extracted successfully using zipfile")
# exit(1)
else:
st.write("Zip file does not exist")
exit(1)
except Exception as e:
st.write(f"Failed to download or extract the model: {e}")
exit(1)
else:
st.write("Version: 1.0")
# Download the required files
file_urls = {
'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}
for filename, url in file_urls.items():
if not os.path.exists(filename): # Check if the file doesn't exist
try:
r = requests.get(url, headers=headers)
r.raise_for_status()
with open(filename, 'wb') as f:
f.write(r.content)
except Exception as e:
st.write(f"Failed to download {filename}: {e}")
exit(1)
# else:
# st.write(f"File {filename} already exists. Skipping download.")
############ Download Ridge and Extra Trees artifacts ############
# def has_internet_connection():
# try:
# response = requests.get("https://www.google.com/", timeout=5)
# return True
# except requests.ConnectionError:
# return False
def is_zip_file(file_path):
return zipfile.is_zipfile(file_path)
def are_files_extracted(extracted_files, missing_files):
    # True only if every previously missing file now appears in the directory listing
    return all(file in extracted_files for file in missing_files)
def check_and_download_files():
file_names = [
"truncated_260_to_284.xlsx_vectorizer.pkl",
"not_trancated_full_paragraph.xlsx_extra_trees_model.pkl",
"not_trancated_full_paragraph.xlsx_ridge_model.pkl",
"not_trancated_full_paragraph.xlsx_vectorizer.pkl",
"truncated_10_to_34.xlsx_extra_trees_model.pkl",
"truncated_10_to_34.xlsx_ridge_model.pkl",
"truncated_10_to_34.xlsx_vectorizer.pkl",
"truncated_35_to_59.xlsx_extra_trees_model.pkl",
"truncated_35_to_59.xlsx_ridge_model.pkl",
"truncated_35_to_59.xlsx_vectorizer.pkl",
"truncated_60_to_84.xlsx_extra_trees_model.pkl",
"truncated_60_to_84.xlsx_ridge_model.pkl",
"truncated_60_to_84.xlsx_vectorizer.pkl",
"truncated_85_to_109.xlsx_extra_trees_model.pkl",
"truncated_85_to_109.xlsx_ridge_model.pkl",
"truncated_85_to_109.xlsx_vectorizer.pkl",
"truncated_110_to_134.xlsx_extra_trees_model.pkl",
"truncated_110_to_134.xlsx_ridge_model.pkl",
"truncated_110_to_134.xlsx_vectorizer.pkl",
"truncated_135_to_159.xlsx_extra_trees_model.pkl",
"truncated_135_to_159.xlsx_ridge_model.pkl",
"truncated_135_to_159.xlsx_vectorizer.pkl",
"truncated_160_to_184.xlsx_extra_trees_model.pkl",
"truncated_160_to_184.xlsx_ridge_model.pkl",
"truncated_160_to_184.xlsx_vectorizer.pkl",
"truncated_185_to_209.xlsx_extra_trees_model.pkl",
"truncated_185_to_209.xlsx_ridge_model.pkl",
"truncated_185_to_209.xlsx_vectorizer.pkl",
"truncated_210_to_234.xlsx_extra_trees_model.pkl",
"truncated_210_to_234.xlsx_ridge_model.pkl",
"truncated_210_to_234.xlsx_vectorizer.pkl",
"truncated_235_to_259.xlsx_extra_trees_model.pkl",
"truncated_235_to_259.xlsx_ridge_model.pkl",
"truncated_235_to_259.xlsx_vectorizer.pkl",
"truncated_260_to_284.xlsx_extra_trees_model.pkl",
"truncated_260_to_284.xlsx_ridge_model.pkl"
]
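    # Aside: this list could also be generated programmatically, e.g.:
    #   prefixes = [f"truncated_{lo}_to_{lo + 24}.xlsx" for lo in range(10, 261, 25)]
    #   prefixes.append("not_trancated_full_paragraph.xlsx")
    #   file_names = [f"{p}_{s}.pkl" for p in prefixes
    #                 for s in ("ridge_model", "extra_trees_model", "vectorizer")]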
missing_files = []
for file_name in file_names:
if not os.path.exists(file_name):
missing_files.append(file_name)
if missing_files:
st.write("The following files are missing:")
for file_name in missing_files:
st.write(file_name)
# if not has_internet_connection():
# st.write("No internet connection. Cannot download missing files.")
# return
try:
            # (reuses the module-level browser-like `headers` defined near the top)
url = 'https://jaifar.net/ADS/content.zip'
response = requests.get(url, headers=headers)
response.raise_for_status()
with open('content.zip', 'wb') as zip_file:
zip_file.write(response.content)
if not is_zip_file('content.zip'):
st.write("Downloaded content is not a ZIP file.")
return
with zipfile.ZipFile('content.zip', 'r') as zip_ref:
zip_ref.extractall()
extracted_files = os.listdir()
if not are_files_extracted(extracted_files, missing_files):
st.write("Not all missing files were extracted.")
return
st.write("content.zip downloaded and extracted successfully.")
except Exception as e:
st.write(f"Error downloading or extracting content.zip: {e}")
# else:
# st.write("All files exist.")
check_and_download_files()
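# Note: check_and_download_files() runs on every Streamlit rerun, but the
# os.path.exists checks make repeat calls cheap once everything is present.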
############### Load CNN Model ############
# Load the saved model
loaded_model = load_model("my_authorship_model")
# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
tokenizer = pickle.load(handle)
with open('label_encoder.pkl', 'rb') as handle:
label_encoder = pickle.load(handle)
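# Note: these one-time loads rerun on every Streamlit interaction; wrapping them
# in a function decorated with @st.cache_resource (available in recent Streamlit
# releases) would avoid reloading. Also, unpickling executes arbitrary code, so
# the .pkl files should only ever come from a trusted source.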
max_length = 300  # CNN input length; padding/truncation must match the training setup
############### End Load CNN Model ############
# Function to predict author for new text
def predict_author(new_text, model, tokenizer, label_encoder):
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
prediction = model.predict(padded_sequence)
predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
probabilities = prediction[0]
author_probabilities = {}
for idx, prob in enumerate(probabilities):
author = label_encoder.inverse_transform([idx])[0]
author_probabilities[author] = prob
return predicted_label, author_probabilities
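# Illustrative usage (hypothetical text; actual probabilities depend on the trained model):
#   label, probs = predict_author("Sample paragraph...", loaded_model, tokenizer, label_encoder)
#   probs might look like {"human": 0.91, "gpt4": 0.05, "gpt3": 0.02, ...}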
new_text = st.text_area("Input Your Text Here:")
# Creates a button named 'Press me'
press_me_button = st.button("Human or Robot?")
if press_me_button:
########## ML
word_count = len(re.findall(r'\w+', new_text))
st.write(f"Words Count: {word_count}")
# Choose the appropriate model based on word count
if 10 <= word_count <= 34:
file_prefix = 'truncated_10_to_34.xlsx'
elif 35 <= word_count <= 59:
file_prefix = 'truncated_35_to_59.xlsx'
elif 60 <= word_count <= 84:
file_prefix = 'truncated_60_to_84.xlsx'
elif 85 <= word_count <= 109:
file_prefix = 'truncated_85_to_109.xlsx'
elif 110 <= word_count <= 134:
file_prefix = 'truncated_110_to_134.xlsx'
elif 135 <= word_count <= 159:
file_prefix = 'truncated_135_to_159.xlsx'
elif 160 <= word_count <= 184:
file_prefix = 'truncated_160_to_184.xlsx'
elif 185 <= word_count <= 209:
file_prefix = 'truncated_185_to_209.xlsx'
elif 210 <= word_count <= 234:
file_prefix = 'truncated_210_to_234.xlsx'
elif 235 <= word_count <= 259:
file_prefix = 'truncated_235_to_259.xlsx'
elif 260 <= word_count <= 284:
file_prefix = 'truncated_260_to_284.xlsx'
else:
file_prefix = 'not_trancated_full_paragraph.xlsx'
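    # Equivalent bucket arithmetic (each truncated range spans 25 words, from
    # 10 up to 284; everything else falls back to the full-paragraph models):
    #   lo = 10 + 25 * ((word_count - 10) // 25)
    #   file_prefix = f"truncated_{lo}_to_{lo + 24}.xlsx"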
# Load the models and vectorizer
with open(f"{file_prefix}_ridge_model.pkl", 'rb') as file:
ridge_model = pickle.load(file)
with open(f"{file_prefix}_extra_trees_model.pkl", 'rb') as file:
extra_trees_model = pickle.load(file)
with open(f"{file_prefix}_vectorizer.pkl", 'rb') as file:
vectorizer = pickle.load(file)
# ML Vectorizing the input
user_input_transformed = vectorizer.transform([new_text])
# ML predictions
ridge_prediction = ridge_model.predict(user_input_transformed)
extra_trees_prediction = extra_trees_model.predict(user_input_transformed)
# CNN prediction + Vectorizing the input
predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)
author_map = {
"googlebard": "Google Bard",
"gpt3": "ChatGPT-3",
"gpt4": "ChatGPT-4",
"huggingface": "HuggingChat",
"human": "Human-Written"
}
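    # Note: this map duplicates the one inside get_author_display_name(); a
    # single module-level AUTHOR_MAP would remove the repetition.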
cnn_name, ridge_name, extra_trees_name = get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction)
with st.expander("Modeling Details (Click Here)..."):
st.write(f"Ridge: {ridge_name}")
st.write(f"ExtraTree: {extra_trees_name}")
st.write(f"CNN: {cnn_name}")
for author, prob in sorted_probabilities:
display_name = author_map.get(author, author) # Retrieve the display name, fall back to original if not found
st.write(f"{display_name}: {prob * 100:.2f}%")
st.progress(float(prob))
    max_cnn_prob_name = sorted_probabilities[0][0]
    max_cnn_prob = float(sorted_probabilities[0][1])  # top CNN probability (currently unused below)
    # Unwrap the single-element prediction arrays for plain string comparisons
    ridge_author = ridge_prediction[0]
    extra_trees_author = extra_trees_prediction[0]
    if word_count < 10 or word_count > 1081:
        st.warning("For better prediction, input a text of between 10 and 1081 words.", icon="ℹ️")
    elif word_count < 256:
        if ridge_author == extra_trees_author == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")
        elif ridge_author == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{extra_trees_name}**", icon="✅")
            st.write("_" * 30)
        elif extra_trees_author == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{ridge_name}**", icon="✅")
            st.write("_" * 30)
        else:
            st.warning("Notice: there is difficulty predicting your text; it might fall into one of the following:", icon="⚠️")
            st.success(f"1- **{cnn_name}**", icon="✅")
            st.success(f"2- **{ridge_name}**", icon="✅")
            st.success(f"3- **{extra_trees_name}**", icon="✅")
    else:
        if ridge_author == extra_trees_author == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")
        elif ridge_author == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{extra_trees_name}**", icon="✅")
            st.write("_" * 30)
        elif ridge_author == extra_trees_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{cnn_name}**", icon="✅")
            st.write("_" * 30)
        else:
            st.warning("Notice: there is difficulty predicting your text; it might fall into one of the following:", icon="⚠️")
            st.success(f"1- **{ridge_name}**", icon="✅")
            st.success(f"2- **{cnn_name}**", icon="✅")
            st.success(f"3- **{extra_trees_name}**", icon="✅")
# Using expander to make FAQ sections
st.subheader("Frequently Asked Questions (FAQ)")
# Small Description
with st.expander("What is this project about?"):
st.write("""
This project is part of an MSc in Data Analytics at the University of Portsmouth.
Developed by Jaifar Al Shizawi, it aims to identify whether a text is written by
a human or a specific Large Language Model (LLM) like ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
For inquiries, contact [[email protected]](mailto:[email protected]).
Supervised by Dr. Mohamed Bader.
""")
# Aim and Objectives
with st.expander("Aim and Objectives"):
st.write("""
The project aims to help staff at the University of Portsmouth distinguish between
student-written artifacts and those generated by LLMs. It focuses on text feature extraction, model testing,
and implementing a user-friendly dashboard among other objectives.
""")
# System Details
with st.expander("How does the system work?"):
st.write("""
    The system is trained using a deep learning model on a dataset of 140,546 paragraphs, varying in length from 10 to 1090 words.
It achieves an accuracy of 0.9964 with a validation loss of 0.094.
""")
# Fetch the image from the URL
accuracy_image_request = requests.get("https://jaifar.net/ADS/best_accuracy.png", headers=headers)
# Save the downloaded content
image_path = "best_accuracy.png"
with open(image_path, "wb") as f:
f.write(accuracy_image_request.content)
# Open the image
accuracy_image = Image.open(image_path)
# Display the image using streamlit
st.image(accuracy_image, caption='Best Accuracy', use_column_width=True)
# Data Storage Information
with st.expander("Does the system store my data?"):
st.write("No, the system does not collect or store any user input data.")
# Use-case Limitation
with st.expander("Can I use this as evidence?"):
st.write("""
No, this system is a Proof of Concept (POC) and should not be used as evidence against students or similar entities.
""")