import streamlit as st
import zipfile
import os
import requests
import re
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
from PIL import Image
from joblib import load
import math
from streamlit_extras.let_it_rain import rain
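#################### Application overview ####################
# Streamlit dashboard that estimates whether a piece of text was written by a
# human or by a specific LLM (ChatGPT-3, ChatGPT-4, Google Bard, HuggingChat).
# A Keras CNN classifier (with its tokenizer and label encoder) is combined with
# word-count-specific Ridge and Extra Trees models; all model artefacts are
# downloaded from https://jaifar.net/ADS/ on first run.
###############################################################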
# Custom headers for the HTTP request
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
#################### Load the banner image ##########
# Fetch the image from the URL
banner_image_request = requests.get("https://jaifar.net/ADS/banner.jpg", headers=headers)
# Save the downloaded content
banner_image_path = "banner.jpg"
with open(banner_image_path, "wb") as f:
f.write(banner_image_request.content)
# Open the image
banner_image = Image.open(banner_image_path)
# Display the image using streamlit
st.image(banner_image, caption='', use_column_width=True)
################ end loading banner image ##################
def get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction):
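    """Map raw class labels (the CNN's string label plus the one-element Ridge and
    Extra Trees prediction arrays) to human-readable author names."""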
author_map = {
"googlebard": "Google Bard",
"gpt3": "ChatGPT-3",
"gpt4": "ChatGPT-4",
"huggingface": "HuggingChat",
"human": "Human-Written"
}
cnn_predicted_author_display_name = author_map.get(predicted_author, predicted_author)
ridge_predicted_author_display_name = author_map.get(ridge_prediction[0], ridge_prediction[0])
extra_trees_predicted_author_display_name = author_map.get(extra_trees_prediction[0], extra_trees_prediction[0])
return cnn_predicted_author_display_name, ridge_predicted_author_display_name, extra_trees_predicted_author_display_name
############# Download Files Or Check Their Existence ##############
# Check if the model folder exists
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
try:
# Download the model
model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
r = requests.get(model_url, headers=headers)
r.raise_for_status()
# Debugging: Check if download is successful by examining content length
# st.write(f"Downloaded model size: {len(r.content)} bytes")
# Save the downloaded content
with open(zip_file_path, "wb") as f:
f.write(r.content)
# Debugging: Verify that the zip file exists
if os.path.exists(zip_file_path):
# st.write("Zip file exists")
# Extract the model using zipfile
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
zip_ref.extractall('my_authorship_model')
# # Debugging: Check if the folder is successfully created
# if os.path.exists('my_authorship_model'):
# # st.write("Model folder successfully extracted using zipfile")
# # Debugging: List the directory contents after extraction
# # st.write("Listing directory contents:")
# # st.write(os.listdir('.'))
# else:
# st.write("Model folder was not extracted successfully using zipfile")
# exit(1)
else:
st.write("Zip file does not exist")
exit(1)
except Exception as e:
st.write(f"Failed to download or extract the model: {e}")
exit(1)
else:
st.write("Version: 0.99")
# Download the required files
file_urls = {
'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}
for filename, url in file_urls.items():
if not os.path.exists(filename): # Check if the file doesn't exist
try:
r = requests.get(url, headers=headers)
r.raise_for_status()
with open(filename, 'wb') as f:
f.write(r.content)
except Exception as e:
st.write(f"Failed to download {filename}: {e}")
exit(1)
# else:
# st.write(f"File {filename} already exists. Skipping download.")
############ Download Ridge and Extra Trees model files ############
# def has_internet_connection():
# try:
# response = requests.get("https://www.google.com/", timeout=5)
# return True
# except requests.ConnectionError:
# return False
def is_zip_file(file_path):
return zipfile.is_zipfile(file_path)
def are_files_extracted(extracted_files, missing_files):
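    """Return True only if every previously missing file now appears in the extracted file list."""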
for file in missing_files:
if file not in extracted_files:
return False
return True
def check_and_download_files():
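    """Check that every bucket-specific Ridge / Extra Trees model and vectorizer pickle
    exists locally; if any are missing, download content.zip from jaifar.net/ADS and
    extract it into the working directory."""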
file_names = [
"truncated_260_to_284.xlsx_vectorizer.pkl",
"not_trancated_full_paragraph.xlsx_extra_trees_model.pkl",
"not_trancated_full_paragraph.xlsx_ridge_model.pkl",
"not_trancated_full_paragraph.xlsx_vectorizer.pkl",
"truncated_10_to_34.xlsx_extra_trees_model.pkl",
"truncated_10_to_34.xlsx_ridge_model.pkl",
"truncated_10_to_34.xlsx_vectorizer.pkl",
"truncated_35_to_59.xlsx_extra_trees_model.pkl",
"truncated_35_to_59.xlsx_ridge_model.pkl",
"truncated_35_to_59.xlsx_vectorizer.pkl",
"truncated_60_to_84.xlsx_extra_trees_model.pkl",
"truncated_60_to_84.xlsx_ridge_model.pkl",
"truncated_60_to_84.xlsx_vectorizer.pkl",
"truncated_85_to_109.xlsx_extra_trees_model.pkl",
"truncated_85_to_109.xlsx_ridge_model.pkl",
"truncated_85_to_109.xlsx_vectorizer.pkl",
"truncated_110_to_134.xlsx_extra_trees_model.pkl",
"truncated_110_to_134.xlsx_ridge_model.pkl",
"truncated_110_to_134.xlsx_vectorizer.pkl",
"truncated_135_to_159.xlsx_extra_trees_model.pkl",
"truncated_135_to_159.xlsx_ridge_model.pkl",
"truncated_135_to_159.xlsx_vectorizer.pkl",
"truncated_160_to_184.xlsx_extra_trees_model.pkl",
"truncated_160_to_184.xlsx_ridge_model.pkl",
"truncated_160_to_184.xlsx_vectorizer.pkl",
"truncated_185_to_209.xlsx_extra_trees_model.pkl",
"truncated_185_to_209.xlsx_ridge_model.pkl",
"truncated_185_to_209.xlsx_vectorizer.pkl",
"truncated_210_to_234.xlsx_extra_trees_model.pkl",
"truncated_210_to_234.xlsx_ridge_model.pkl",
"truncated_210_to_234.xlsx_vectorizer.pkl",
"truncated_235_to_259.xlsx_extra_trees_model.pkl",
"truncated_235_to_259.xlsx_ridge_model.pkl",
"truncated_235_to_259.xlsx_vectorizer.pkl",
"truncated_260_to_284.xlsx_extra_trees_model.pkl",
"truncated_260_to_284.xlsx_ridge_model.pkl"
]
missing_files = []
for file_name in file_names:
if not os.path.exists(file_name):
missing_files.append(file_name)
if missing_files:
st.write("The following files are missing:")
for file_name in missing_files:
st.write(file_name)
# if not has_internet_connection():
# st.write("No internet connection. Cannot download missing files.")
# return
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
url = 'https://jaifar.net/ADS/content.zip'
response = requests.get(url, headers=headers)
response.raise_for_status()
with open('content.zip', 'wb') as zip_file:
zip_file.write(response.content)
if not is_zip_file('content.zip'):
st.write("Downloaded content is not a ZIP file.")
return
with zipfile.ZipFile('content.zip', 'r') as zip_ref:
zip_ref.extractall()
extracted_files = os.listdir()
if not are_files_extracted(extracted_files, missing_files):
st.write("Not all missing files were extracted.")
return
st.write("content.zip downloaded and extracted successfully.")
except Exception as e:
st.write(f"Error downloading or extracting content.zip: {e}")
# else:
# st.write("All files exist.")
check_and_download_files()
############### Load CNN Model ############
# Load the saved model
loaded_model = load_model("my_authorship_model")
# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
tokenizer = pickle.load(handle)
with open('label_encoder.pkl', 'rb') as handle:
label_encoder = pickle.load(handle)
max_length = 300 # As defined in the training code
############### End Load CNN Model ############
# Function to predict author for new text
def predict_author(new_text, model, tokenizer, label_encoder):
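    """Tokenize and pad new_text, run the CNN, and return the most likely author label
    together with a dictionary mapping each author to its predicted probability."""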
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
prediction = model.predict(padded_sequence)
predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
probabilities = prediction[0]
author_probabilities = {}
for idx, prob in enumerate(probabilities):
author = label_encoder.inverse_transform([idx])[0]
author_probabilities[author] = prob
return predicted_label, author_probabilities
new_text = st.text_area("Input Your Text Here:")
# Creates a button named 'Press me'
press_me_button = st.button("Human or Robot?")
if press_me_button:
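    # Pipeline: count the words, load the Ridge / Extra Trees models trained on the
    # matching word-count bucket, run them alongside the CNN, and report a result
    # based on how strongly the three classifiers agree (see the bucket sketch after
    # the if/elif chain below).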
########## ML
word_count = len(re.findall(r'\w+', new_text))
st.write(f"Words Count: {word_count}")
# Choose the appropriate model based on word count
if 10 <= word_count <= 34:
file_prefix = 'truncated_10_to_34.xlsx'
elif 35 <= word_count <= 59:
file_prefix = 'truncated_35_to_59.xlsx'
elif 60 <= word_count <= 84:
file_prefix = 'truncated_60_to_84.xlsx'
elif 85 <= word_count <= 109:
file_prefix = 'truncated_85_to_109.xlsx'
elif 110 <= word_count <= 134:
file_prefix = 'truncated_110_to_134.xlsx'
elif 135 <= word_count <= 159:
file_prefix = 'truncated_135_to_159.xlsx'
elif 160 <= word_count <= 184:
file_prefix = 'truncated_160_to_184.xlsx'
elif 185 <= word_count <= 209:
file_prefix = 'truncated_185_to_209.xlsx'
elif 210 <= word_count <= 234:
file_prefix = 'truncated_210_to_234.xlsx'
elif 235 <= word_count <= 259:
file_prefix = 'truncated_235_to_259.xlsx'
elif 260 <= word_count <= 284:
file_prefix = 'truncated_260_to_284.xlsx'
else:
file_prefix = 'not_trancated_full_paragraph.xlsx'
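    # The chain above is equivalent to this minimal sketch (hypothetical helper,
    # not used by the app): buckets are 25 words wide, from 10-34 up to 260-284,
    # and anything outside those ranges falls back to the full-paragraph models.
    #
    #     def bucket_prefix(word_count: int) -> str:
    #         for low in range(10, 261, 25):                    # 10, 35, ..., 260
    #             if low <= word_count <= low + 24:
    #                 return f"truncated_{low}_to_{low + 24}.xlsx"
    #         return "not_trancated_full_paragraph.xlsx"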
# Load the models and vectorizer
with open(f"{file_prefix}_ridge_model.pkl", 'rb') as file:
ridge_model = pickle.load(file)
with open(f"{file_prefix}_extra_trees_model.pkl", 'rb') as file:
extra_trees_model = pickle.load(file)
with open(f"{file_prefix}_vectorizer.pkl", 'rb') as file:
vectorizer = pickle.load(file)
# Transform the input
user_input_transformed = vectorizer.transform([new_text])
# Make predictions
ridge_prediction = ridge_model.predict(user_input_transformed)
extra_trees_prediction = extra_trees_model.predict(user_input_transformed)
########## DL
predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)
author_map = {
"googlebard": "Google Bard",
"gpt3": "ChatGPT-3",
"gpt4": "ChatGPT-4",
"huggingface": "HuggingChat",
"human": "Human-Written"
}
# cnn_name = author_map.get(predicted_author, predicted_author)
# ridge_name = author_map.get(ridge_prediction[0], ridge_prediction[0])
# extra_trees_name = author_map.get(extra_trees_prediction[0], extra_trees_prediction[0])
cnn_name, ridge_name, extra_trees_name = get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction)
    if ridge_prediction[0] == extra_trees_prediction[0] == predicted_author:
st.success(f"Most likely written by: **{ridge_name}**", icon="β
")
st.info("We are quite confident in the accuracy of this result.", icon="βΉοΈ")
rain(
            emoji="🎈",
font_size=54,
falling_speed=5,
animation_length="infinite",
)
else:
        # The bucket-specific models disagreed, so fall back to the models
        # pre-trained on full-length paragraphs and re-run the prediction.
file_prefix = 'not_trancated_full_paragraph.xlsx'
with open(f"{file_prefix}_ridge_model.pkl", 'rb') as file:
ridge_model = pickle.load(file)
with open(f"{file_prefix}_extra_trees_model.pkl", 'rb') as file:
extra_trees_model = pickle.load(file)
with open(f"{file_prefix}_vectorizer.pkl", 'rb') as file:
vectorizer = pickle.load(file)
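        # Amplification: repeat the text until it reaches roughly max_word_count (500)
        # words, then re-run both classifier families on the longer string. For example,
        # a 120-word input gives amplify = ceil(500 / 120) = 5 concatenated copies.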
repeated_text = ""
max_word_count = 500
amplify = 1
if word_count >= max_word_count:
amplify = 2
else:
            amplify = math.ceil(max_word_count / max(word_count, 1))  # guard against empty input
for _ in range(amplify):
repeated_text += new_text + " "
new_text = repeated_text
word_count = len(re.findall(r'\w+', new_text))
## Repeat ML
# Transform the input
user_input_transformed = vectorizer.transform([new_text])
# Make predictions
ridge_prediction = ridge_model.predict(user_input_transformed)
extra_trees_prediction = extra_trees_model.predict(user_input_transformed)
### Repeat DL
predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)
        # Get display name
cnn_name, ridge_name, extra_trees_name = get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction)
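        # Report the re-run result according to how the three classifiers agree:
        # all three match -> one confident answer; exactly two match -> ranked first
        # and second guesses; no agreement -> all three candidates with a caution notice.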
        if ridge_prediction[0] == extra_trees_prediction[0] == predicted_author:
st.success(f"Most likely written by: **{ridge_name}**", icon="β
")
st.warning(f"**Notice:** Your input text has been magnified {amplify} times to better capture its characteristics and patterns.", icon="β οΈ")
st.write("_" * 30)
rain(
                emoji="🎈",
font_size=54,
falling_speed=5,
animation_length="infinite",
)
        elif ridge_prediction[0] == extra_trees_prediction[0]:
st.success(f"Most likely written by: **{ridge_name}**", icon="β
")
st.success(f"2nd Most likely written by: **{cnn_name}**", icon="β
")
st.warning(f"**Notice:** The input text has been magnified {amplify} times to better capture its characteristics and patterns.", icon="β οΈ")
st.write("_" * 30)
rain(
                emoji="🎈",
font_size=54,
falling_speed=5,
animation_length="infinite",
)
        elif extra_trees_prediction[0] == predicted_author:
st.success(f"Most likely written by: **{extra_trees_name}**", icon="β
")
st.success(f"2nd Most likely written by: **{ridge_name}**", icon="β
")
st.warning(f"**Notice:** The input text has been magnified {amplify} times to better capture its characteristics and patterns.", icon="β οΈ")
st.write("_" * 30)
rain(
                emoji="🎈",
font_size=54,
falling_speed=5,
animation_length="infinite",
)
        elif ridge_prediction[0] == predicted_author:
st.success(f"Most likely written by: **{ridge_name}**", icon="β
")
st.success(f"2nd Most likely written by: **{extra_trees_name}**", icon="β
")
st.warning(f"**Notice:** The input text has been magnified {amplify} times to better capture its characteristics and patterns.", icon="β οΈ")
st.write("_" * 30)
rain(
                emoji="🎈",
font_size=54,
falling_speed=5,
animation_length="infinite",
)
else:
st.warning("Notice 1: There is a difficulity predicting your text, it might fill into one of the below:", icon="β οΈ")
st.success(f"1- **{ridge_name}**", icon="β
")
st.success(f"2- **{cnn_name}**", icon="β
")
st.success(f"3- **{extra_trees_name}**", icon="β
")
st.warning(f"**Notice 2:** The input text has been magnified {amplify} times to better capture its characteristics and patterns.", icon="β οΈ")
st.write("_" * 30)
rain(
                emoji="🎈",
font_size=54,
falling_speed=5,
animation_length="infinite",
)
# with st.expander("What is this project about?"):
# st.write("""
# This project is part of an MSc in Data Analytics at the University of Portsmouth.
# Developed by Jaifar Al Shizawi, it aims to identify whether a text is written by a human or a specific Large Language Model (LLM) like ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
# For inquiries, contact [[email protected]](mailto:[email protected]).
# Supervised by Dr. Mohamed Bader.
# """)
# for author, prob in sorted_probabilities:
# display_name = author_map.get(author, author) # Retrieve the display name, fall back to original if not found
# st.write(f"{display_name}: {prob * 100:.2f}%")
# st.progress(float(prob))
# Using expander to make FAQ sections
st.subheader("Frequently Asked Questions (FAQ)")
# Small Description
with st.expander("What is this project about?"):
st.write("""
This project is part of an MSc in Data Analytics at the University of Portsmouth.
Developed by Jaifar Al Shizawi, it aims to identify whether a text is written by a human or a specific Large Language Model (LLM) like ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
For inquiries, contact [[email protected]](mailto:[email protected]).
Supervised by Dr. Mohamed Bader.
""")
# Aim and Objectives
with st.expander("Aim and Objectives"):
st.write("""
    The project aims to help staff at the University of Portsmouth distinguish between student-written artifacts and those generated by LLMs. It focuses on text feature extraction, model testing, and implementing a user-friendly dashboard, among other objectives.
""")
# System Details
with st.expander("How does the system work?"):
st.write("""
    The system is trained using a deep learning model on a dataset of 140,546 paragraphs, varying in length from 10 to 1090 words.
It achieves an accuracy of 0.9964 with a validation loss of 0.094.
""")
# Fetch the image from the URL
accuracy_image_request = requests.get("https://jaifar.net/ADS/best_accuracy.png", headers=headers)
# Save the downloaded content
image_path = "best_accuracy.png"
with open(image_path, "wb") as f:
f.write(accuracy_image_request.content)
# Open the image
accuracy_image = Image.open(image_path)
# Display the image using streamlit
st.image(accuracy_image, caption='Best Accuracy', use_column_width=True)
# Data Storage Information
with st.expander("Does the system store my data?"):
st.write("No, the system does not collect or store any user input data.")
# Use-case Limitation
with st.expander("Can I use this as evidence?"):
st.write("""
No, this system is a Proof of Concept (POC) and should not be used as evidence against students or similar entities.
""")
# # Creates a button named 'Press me'
# list_dir = st.button("list")
# if list_dir:
# st.write("Listing directory contents:")
# st.write(os.listdir('.'))