import streamlit as st
import zipfile
import os
import requests
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
from PIL import Image
from io import BytesIO
# Custom headers for the HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
# Check if the model folder exists
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model archive
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers)
        r.raise_for_status()

        # Debugging: check that the download succeeded by examining the content length
        st.write(f"Downloaded model size: {len(r.content)} bytes")

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        # Debugging: verify that the zip file exists
        if os.path.exists(zip_file_path):
            st.write("Zip file exists")

            # Extract the model using zipfile
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall('my_authorship_model')

            # Debugging: check that the model folder was successfully created
            if os.path.exists('my_authorship_model'):
                st.write("Model folder successfully extracted using zipfile")
                # Debugging: list the directory contents after extraction
                st.write("Listing directory contents:")
                st.write(os.listdir('.'))
            else:
                st.write("Model folder was not extracted successfully using zipfile")
                st.stop()
        else:
            st.write("Zip file does not exist")
            st.stop()
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        st.stop()
# else:
#     st.write("System Ready !!")
# Download the required files
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}
for filename, url in file_urls.items():
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        st.write(f"Failed to download {filename}: {e}")
        st.stop()
# Load the saved model
loaded_model = load_model("my_authorship_model")
# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)
max_length = 300 # As defined in the training code
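# A minimal illustration of the preprocessing used below, with hypothetical
# token IDs (real IDs depend on the fitted tokenizer's vocabulary):
#
#   tokenizer.texts_to_sequences(["hello world"])        -> [[12, 48]]
#   pad_sequences([[12, 48]], maxlen=5, padding='post')  -> [[12, 48, 0, 0, 0]]
#
# With padding='post' and truncating='post', short texts are zero-padded at the
# end and long texts keep only their first max_length tokens.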
# Function to predict author for new text
def predict_author(new_text, model, tokenizer, label_encoder):
    """Return the most likely author label and a dict mapping each author to its probability."""
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence)

    predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
    probabilities = prediction[0]
    author_probabilities = {}
    for idx, prob in enumerate(probabilities):
        author = label_encoder.inverse_transform([idx])[0]
        author_probabilities[author] = prob

    return predicted_label, author_probabilities
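# Illustrative call (a sketch only; the author labels and probabilities shown
# here are hypothetical and in practice come from the fitted label encoder):
#
#   label, probs = predict_author("Some paragraph...", loaded_model, tokenizer, label_encoder)
#   label -> 'ChatGPT-4'
#   probs -> {'Human': 0.02, 'ChatGPT-3': 0.05, 'ChatGPT-4': 0.91, ...}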
new_text = st.text_area("Input your text here")
# Create the classification button
press_me_button = st.button("Writer or Robot?")
if press_me_button:
    predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
    sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

    st.write(f"The text is most likely written by: {predicted_author}")
    st.write("Probabilities for each author are (sorted):")
    for author, prob in sorted_probabilities:
        st.write(f"{author}: {prob * 100:.2f}%")
# Use expanders to build the FAQ sections
st.subheader("Frequently Asked Questions (FAQ)")
# Small Description
with st.expander("What is this project about?"):
    st.write("""
    This project is part of an MSc in Data Analytics at the University of Portsmouth.
    Developed by Jaifar Al Shizawi, it aims to identify whether a text is written by a human or by a specific Large Language Model (LLM) such as ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
    For inquiries, contact [[email protected]](mailto:[email protected]).
    Supervised by Dr. Mohamed Bader.
    """)
# Aim and Objectives
with st.expander("Aim and Objectives"):
    st.write("""
    The project aims to help staff at the University of Portsmouth distinguish between student-written artifacts and those generated by LLMs. It focuses on text feature extraction, model testing, and implementing a user-friendly dashboard, among other objectives.
    """)
# System Details
with st.expander("How does the system work?"):
    st.write("""
    The system uses a CNN model trained on a dataset of 140,546 paragraphs ranging from 10 to 500 words in length.
    It achieves an accuracy of 0.9964 with a validation loss of 0.094.
    """)

    # Fetch the accuracy chart and open it from the response bytes
    accuracy_image_request = requests.get("https://jaifar.net/best_accuracy.png", headers=headers)
    accuracy_image_request.raise_for_status()
    accuracy_image = Image.open(BytesIO(accuracy_image_request.content))

    # Display the image using Streamlit
    st.image(accuracy_image, caption='Best Accuracy', use_column_width=True)
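# A minimal sketch of the kind of Keras CNN text classifier described in the
# FAQ above (illustrative only: the trained architecture actually used is the
# one shipped in my_authorship_model, and vocab_size, num_classes, and the
# layer sizes here are assumptions):
#
#   from keras.models import Sequential
#   from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
#
#   model = Sequential([
#       Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
#       Conv1D(filters=128, kernel_size=5, activation='relu'),
#       GlobalMaxPooling1D(),
#       Dense(num_classes, activation='softmax'),
#   ])
#   model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])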
# Data Storage Information
with st.expander("Does the system store my data?"):
    st.write("No, the system does not collect or store any user input data.")
# Use-case Limitation
with st.expander("Can I use this as evidence?"):
    st.write("""
    No, this system is a Proof of Concept (POC) and should not be used as evidence against students or similar entities.
    """)