import streamlit as st
import zipfile
import os
import sys
import requests
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
from PIL import Image
# Custom headers for the HTTP requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
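# Note: the browser-style User-Agent above is presumably used because some
# hosts reject or throttle requests that carry a default HTTP-client UA.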
#################### Load the banner image ##########
# Fetch the image from the URL
banner_image_request = requests.get("https://jaifar.net/ADS/banner.jpg", headers=headers)
banner_image_request.raise_for_status()

# Save the downloaded content
banner_image_path = "banner.jpg"
with open(banner_image_path, "wb") as f:
    f.write(banner_image_request.content)

# Open the image
banner_image = Image.open(banner_image_path)

# Display the image using Streamlit
st.image(banner_image, caption='', use_column_width=True)
################ end loading banner image ##################
############# Download or check files/folders existence ##############
# Check if the model folder exists
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers)
        r.raise_for_status()

        # Debugging: check that the download succeeded by examining the content length
        st.write(f"Downloaded model size: {len(r.content)} bytes")

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        # Debugging: verify that the zip file exists
        if os.path.exists(zip_file_path):
            st.write("Zip file exists")

            # Extract the model archive
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall('my_authorship_model')

            # Debugging: check that the folder was created
            if os.path.exists('my_authorship_model'):
                st.write("Model folder successfully extracted using zipfile")
                # Debugging: list the directory contents after extraction
                st.write("Listing directory contents:")
                st.write(os.listdir('.'))
            else:
                st.write("Model folder was not extracted successfully using zipfile")
                sys.exit(1)
        else:
            st.write("Zip file does not exist")
            sys.exit(1)
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        sys.exit(1)
else:
    st.write("Version: 2.1")
# Download the required files
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}

for filename, url in file_urls.items():
    if not os.path.exists(filename):  # Only download files that are missing
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            st.write(f"Failed to download {filename}: {e}")
            sys.exit(1)
    else:
        st.write(f"File {filename} already exists. Skipping download.")
############### Load CNN Model ############
# Load the saved model
loaded_model = load_model("my_authorship_model")

# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

max_length = 300  # Must match the sequence length used in the training code
############### End Load CNN Model ############
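# Possible optimization (a sketch only, assuming a Streamlit version that
# provides st.cache_resource, i.e. >= 1.18): wrapping the loads above in a
# cached helper avoids re-reading the model from disk on every script rerun.
#
#   @st.cache_resource
#   def get_artifacts():
#       model = load_model("my_authorship_model")
#       with open('tokenizer.pkl', 'rb') as handle:
#           tok = pickle.load(handle)
#       with open('label_encoder.pkl', 'rb') as handle:
#           enc = pickle.load(handle)
#       return model, tok, enc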
# Function to predict the author of new text
def predict_author(new_text, model, tokenizer, label_encoder):
    # Convert the raw text to a padded integer sequence, exactly as during training
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    # The model outputs one probability per author class
    prediction = model.predict(padded_sequence)
    predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]

    # Map each class index back to its author name
    probabilities = prediction[0]
    author_probabilities = {}
    for idx, prob in enumerate(probabilities):
        author = label_encoder.inverse_transform([idx])[0]
        author_probabilities[author] = prob

    return predicted_label, author_probabilities
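
# Illustrative usage (not executed here): the helper returns the best label
# plus a dict of per-author probabilities, e.g.
#   label, probs = predict_author("Some paragraph to test.", loaded_model, tokenizer, label_encoder)
#   # label is one of the trained class names, e.g. "human";
#   # probs maps every class name to its softmax probability.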
new_text = st.text_area("Input Your Text Here:")

# Button that triggers the prediction
press_me_button = st.button("Human or Robot?")

if press_me_button:
    predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
    sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

    # Mapping the internal class names to display names
    author_map = {
        "googlebard": "Google Bard",
        "gpt3": "ChatGPT-3",
        "gpt4": "ChatGPT-4",
        "huggingface": "HuggingChat",
        "human": "Human-Written"
    }

    predicted_author_display_name = author_map.get(predicted_author, predicted_author)
    st.write(f"The text is most likely written by: {predicted_author_display_name}")

    st.write("Probabilities for each author (sorted):")
    for author, prob in sorted_probabilities:
        display_name = author_map.get(author, author)  # Fall back to the internal name if not mapped
        st.write(f"{display_name}: {prob * 100:.2f}%")
        st.progress(float(prob))
# Use expanders for the FAQ sections
st.subheader("Frequently Asked Questions (FAQ)")

# Small description
with st.expander("What is this project about?"):
    st.write("""
    This project is part of an MSc in Data Analytics at the University of Portsmouth.
    Developed by Jaifar Al Shizawi, it aims to identify whether a text was written by a human or by a specific Large Language Model (LLM) such as ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
    For inquiries, contact [[email protected]](mailto:[email protected]).
    Supervised by Dr. Mohamed Bader.
    """)
# Aim and objectives
with st.expander("Aim and Objectives"):
    st.write("""
    The project aims to help staff at the University of Portsmouth distinguish between student-written artifacts and those generated by LLMs. It focuses on text feature extraction, model testing, and implementing a user-friendly dashboard, among other objectives.
    """)
# System details
with st.expander("How does the system work?"):
    st.write("""
    The system uses a deep learning model trained on a dataset of 140,546 paragraphs, ranging in length from 10 to 1,090 words.
    It achieves an accuracy of 0.9964 with a validation loss of 0.094.
    """)

    # Fetch the accuracy chart from the URL
    accuracy_image_request = requests.get("https://jaifar.net/ADS/best_accuracy.png", headers=headers)

    # Save the downloaded content
    image_path = "best_accuracy.png"
    with open(image_path, "wb") as f:
        f.write(accuracy_image_request.content)

    # Open the image
    accuracy_image = Image.open(image_path)

    # Display the image using Streamlit
    st.image(accuracy_image, caption='Best Accuracy', use_column_width=True)
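# A minimal sketch of a comparable Keras text-CNN classifier (an illustrative
# assumption only: the actual trained architecture is whatever is stored in
# the my_authorship_model folder loaded above):
#
#   from keras import layers, models
#   sketch = models.Sequential([
#       layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
#                        output_dim=128, input_length=max_length),
#       layers.Conv1D(128, 5, activation='relu'),
#       layers.GlobalMaxPooling1D(),
#       layers.Dense(64, activation='relu'),
#       layers.Dense(len(label_encoder.classes_), activation='softmax'),
#   ])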
# Data storage information
with st.expander("Does the system store my data?"):
    st.write("No, the system does not collect or store any user input data.")

# Use-case limitation
with st.expander("Can I use this as evidence?"):
    st.write("""
    No, this system is a Proof of Concept (POC) and should not be used as evidence against students or similar entities.
    """)