File size: 7,692 Bytes
ac163d9
91871c3
8e0cc30
 
 
 
 
 
 
 
626c58a
8e0cc30
 
 
 
 
 
 
55940d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b7bbcd
7ceaece
8e0cc30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91871c3
 
 
 
 
 
 
8e0cc30
 
 
 
91871c3
8e0cc30
91871c3
8e0cc30
 
 
 
 
 
7ceaece
 
8e0cc30
 
 
 
 
 
 
 
 
7ceaece
 
 
 
 
 
 
 
 
 
 
8e0cc30
7ceaece
8e0cc30
 
 
 
 
 
 
 
 
 
 
 
7ceaece
 
8e0cc30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcb95ab
8e0cc30
 
69aa5ac
8e0cc30
 
 
 
d8667c4
69b40a6
 
 
 
 
 
 
 
d8667c4
 
 
 
 
 
8e0cc30
69b40a6
 
8fe2540
 
3b7bbcd
 
 
 
 
 
 
 
 
 
 
90f4805
 
 
3b7bbcd
90f4805
3b7bbcd
 
626c58a
 
 
24ff7ed
626c58a
 
1786f28
626c58a
23a0b81
626c58a
1786f28
 
 
 
 
 
626c58a
1786f28
626c58a
 
 
90f4805
3b7bbcd
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import streamlit as st
import zipfile
import os
import requests
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
from PIL import Image


# Custom headers for the HTTP request — some hosts reject requests without a
# browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}

#################### Load the banner image ##########
# Fetch the image from the URL
banner_image_request = requests.get("https://jaifar.net/ADS/banner.jpg", headers=headers)
# Fail fast on an HTTP error instead of writing an error page to disk and
# crashing later inside Image.open (consistent with the model download below).
banner_image_request.raise_for_status()

# Save the downloaded content
banner_image_path = "banner.jpg"
with open(banner_image_path, "wb") as f:
    f.write(banner_image_request.content)

# Open the image
banner_image = Image.open(banner_image_path)

# Display the image using streamlit
st.image(banner_image, caption='', use_column_width=True)

################ end loading banner image ##################


############# Download Or Check Files/folders existence ##############
# Download and extract the CNN model archive on first run; on later runs the
# extracted folder already exists and the download is skipped.
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model archive
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers)
        r.raise_for_status()

        # Debugging: check if download is successful by examining content length
        st.write(f"Downloaded model size: {len(r.content)} bytes")

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        # Debugging: verify that the zip file exists before extracting
        if os.path.exists(zip_file_path):
            st.write("Zip file exists")

            # Extract the model using zipfile
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall('my_authorship_model')

            # Debugging: check if the folder was successfully created
            if os.path.exists('my_authorship_model'):
                st.write("Model folder successfully extracted using zipfile")
                # Debugging: list the directory contents after extraction
                st.write("Listing directory contents:")
                st.write(os.listdir('.'))
            else:
                st.write("Model folder was not extracted successfully using zipfile")
                # st.stop() is the Streamlit-idiomatic way to halt the script
                # (exit(1) relies on the site-module builtin and kills the worker).
                st.stop()

        else:
            st.write("Zip file does not exist")
            st.stop()
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        st.stop()
else:
    st.write("Version: 2.1")


# Download the required preprocessing artifacts (tokenizer and label encoder)
# if they are not already cached locally.
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}

for filename, url in file_urls.items():
    if not os.path.exists(filename):  # only fetch files that are missing
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            # Bug fix: the message previously printed the literal text
            # "(unknown)" instead of the name of the file that failed.
            st.write(f"Failed to download {filename}: {e}")
            st.stop()
    else:
        st.write(f"File {filename} already exists. Skipping download.")

############### Load CNN Model ############
# Load the saved authorship-attribution model from the extracted folder
# (Keras SavedModel directory format).
loaded_model = load_model("my_authorship_model")

# Load the saved tokenizer and label encoder.
# SECURITY NOTE(review): pickle.load executes arbitrary code from the file;
# these files are downloaded over the network above, so this trusts
# jaifar.net entirely — consider a safer serialization format.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

# Maximum token-sequence length fed to the model.
max_length = 300  # As defined in the training code

############### End Load CNN Model ############

# Function to predict author for new text
def predict_author(new_text, model, tokenizer, label_encoder, max_length=300):
    """Classify a text and return its predicted author plus class probabilities.

    Args:
        new_text: The raw text to classify.
        model: Trained Keras classifier producing a probability per class.
        tokenizer: Fitted Keras tokenizer used during training.
        label_encoder: Fitted sklearn LabelEncoder mapping class indices to
            author labels.
        max_length: Padding/truncation length for the token sequence.
            Defaults to 300, matching the training configuration (previously
            read from a module-level global).

    Returns:
        A tuple ``(predicted_label, author_probabilities)`` where
        ``author_probabilities`` maps each author label to its probability.
    """
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence)

    probabilities = prediction[0]
    predicted_label = label_encoder.inverse_transform([probabilities.argmax()])[0]

    # One vectorized inverse_transform for all class indices instead of one
    # call per class.
    authors = label_encoder.inverse_transform(list(range(len(probabilities))))
    author_probabilities = dict(zip(authors, probabilities))

    return predicted_label, author_probabilities

# Maps the model's internal class labels to user-facing display names.
author_map = {
    "googlebard": "Google Bard",
    "gpt3": "ChatGPT-3",
    "gpt4": "ChatGPT-4",
    "huggingface": "HuggingChat",
    "human": "Human-Written"
}

new_text = st.text_area("Input Your Text Here:")

# Button that triggers classification of the entered text
press_me_button = st.button("Human or Robot?")

if press_me_button:
    if not new_text.strip():
        # Robustness: an empty input would still produce a (meaningless)
        # prediction, so prompt the user instead.
        st.warning("Please enter some text first.")
    else:
        predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
        # Sort authors by probability, most likely first
        sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

        # (typo fix: was "predicted_author_diplay_name")
        predicted_author_display_name = author_map.get(predicted_author, predicted_author)
        st.write(f"The text is most likely written by: {predicted_author_display_name}")
        st.write("Probabilities for each author are (sorted):")

        for author, prob in sorted_probabilities:
            display_name = author_map.get(author, author)  # fall back to the raw label
            st.write(f"{display_name}: {prob * 100:.2f}%")
            st.progress(float(prob))  # st.progress needs a builtin float in [0, 1]

# Using expander to make FAQ sections
st.subheader("Frequently Asked Questions (FAQ)")

# Small Description
with st.expander("What is this project about?"):
    st.write("""
    This project is part of an MSc in Data Analytics at the University of Portsmouth.
    Developed by Jaifar Al Shizawi, it aims to identify whether a text is written by a human or a specific Large Language Model (LLM) like ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
    For inquiries, contact [[email protected]](mailto:[email protected]).
    Supervised by Dr. Mohamed Bader.
    """)

# Aim and Objectives
with st.expander("Aim and Objectives"):
    st.write("""
    The project aims to help staff at the University of Portsmouth distinguish between student-written artifacts and those generated by LLMs. It focuses on text feature extraction, model testing, and implementing a user-friendly dashboard among other objectives.
    """)

# System Details
with st.expander("How does the system work?"):
    st.write("""
    The system is trained using deep learning model on a dataset of 140,546 paragraphs, varying in length from 10 to 1090 words.
    It achieves an accuracy of 0.9964 with a validation loss of 0.094.
    """)

    # Fetch the accuracy chart from the URL
    accuracy_image_request = requests.get("https://jaifar.net/ADS/best_accuracy.png", headers=headers)
    # Fail fast on an HTTP error instead of saving an error page and crashing
    # in Image.open (consistent with the model download above).
    accuracy_image_request.raise_for_status()

    # Save the downloaded content
    image_path = "best_accuracy.png"
    with open(image_path, "wb") as f:
        f.write(accuracy_image_request.content)

    # Open the image
    accuracy_image = Image.open(image_path)

    # Display the image using streamlit
    st.image(accuracy_image, caption='Best Accuracy', use_column_width=True)

# Data Storage Information
with st.expander("Does the system store my data?"):
    st.write("No, the system does not collect or store any user input data.")

# Use-case Limitation
with st.expander("Can I use this as evidence?"):
    st.write("""
    No, this system is a Proof of Concept (POC) and should not be used as evidence against students or similar entities.
    """)