Spaces: jaifar530 committed "Big update"

app.py CHANGED
@@ -1,11 +1,4 @@
 import streamlit as st
-
-#title
-st.title("Smart Detection System of AI-Generated Text Models")
-
-#subtitle
-st.markdown("This is a POC for the Smart Detection System of AI-Generated Text Models project (:blue[MSc Data Analytics]). It is a pre-trained model that detects the probability of using any of the known LLMs (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
-
 import os
 import requests
 import pickle
@@ -19,6 +12,7 @@ import numpy as np
 from nltk.stem import WordNetLemmatizer
 from nltk import ne_chunk, pos_tag, word_tokenize
 from nltk.tree import Tree
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 nltk.download('wordnet')
 nltk.download('maxent_ne_chunker')
 nltk.download('words')
@@ -28,6 +22,27 @@ nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
 
+
+#title
+st.title("Smart Detection System of AI-Generated Text Models")
+
+#subtitle
+st.markdown("This is a POC for the Smart Detection System of AI-Generated Text Models project (:blue[MSc Data Analytics]). It is a pre-trained model that detects the probability of using any of the known LLMs (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
+
+#input text
+input_paragraph = st.text_area("Input your text here")
+words_counts = word_tokenize(input_paragraph)
+final_words = len(words_counts)
+st.write('Words counts: ', final_words)
+
+# Define your options
+options = ["AI vs AI - RandomForest - 88 Samples", "AI vs AI - Ridge - 2000 Samples", "AI vs Human"]
+
+# Create a dropdown menu (defaults to the first option)
+# selected_option = st.selectbox('Select an Option', options, index=1)
+selected_option = st.selectbox('Select an Option', options)
+
+
 # Check if the file exists
 if not os.path.isfile('RandomForestClassifier.pkl'):
     # Download the zip file if it doesn't exist
@@ -42,15 +57,20 @@ if not os.path.isfile('RandomForestClassifier.pkl'):
     with open('RandomForestClassifier.pkl', 'wb') as file:
         file.write(response.content)
 
-# At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
-with open('RandomForestClassifier.pkl', 'rb') as file:
-    clf_loaded = pickle.load(file)
 
+# Check if the file exists
+if not os.path.isfile('AI_vs_AI_Ridge_2000_Samples.pkl'):
+    # Download the zip file if it doesn't exist
+    url = 'https://jaifar.net/AI_vs_AI_Ridge_2000_Samples.pkl'
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+    }
 
+    response = requests.get(url, headers=headers)
+
+    # Save the file
+    with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'wb') as file:
+        file.write(response.content)
-
-
-
-
 
 
 
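Note: the download-if-missing block now appears twice, once per pickle file. A minimal sketch of a shared helper that captures the same pattern (the name download_if_missing is hypothetical, not part of this commit):

import os
import requests

def download_if_missing(url, path):
    # Hypothetical helper: skip the request when the file is already cached locally.
    if os.path.isfile(path):
        return
    headers = {'User-Agent': 'Mozilla/5.0'}  # browser-like header, as in the app
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # fail loudly instead of pickling an HTML error page
    with open(path, 'wb') as file:
        file.write(response.content)

# Usage:
# download_if_missing('https://jaifar.net/AI_vs_AI_Ridge_2000_Samples.pkl',
#                     'AI_vs_AI_Ridge_2000_Samples.pkl')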
@@ -68,7 +88,8 @@ num_words = 500
 input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
 
 # Extracting features
-def extract_features(text):
+def extract_features_AI_vs_AI_Ridge_2000_Samples(text):
+
     words = word_tokenize(text)
     sentences = sent_tokenize(text)
 
@@ -93,13 +114,18 @@ def extract_features(text):
         'pos_WDT': pos_counts['WDT'],
         'pos_TO': pos_counts['TO'],
         'pos_VB': pos_counts['VB'],
+        'pos_PRP': pos_counts['PRP'],
+        'pos_VBP': pos_counts['VBP'],
         'pos_VBG': pos_counts['VBG'],
         'pos_.': pos_counts['.'],
         'pos_JJ': pos_counts['JJ'],
         'pos_NNS': pos_counts['NNS'],
         'pos_RB': pos_counts['RB'],
+        'pos_PRP$': pos_counts['PRP$'],
         'pos_CC': pos_counts['CC'],
+        'pos_MD': pos_counts['MD'],
         'pos_VBN': pos_counts['VBN'],
+        'pos_NNP': pos_counts['NNP'],
     }
 
     features = {
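Note: the new pos_PRP$, pos_MD, and pos_NNP entries index pos_counts by Penn Treebank tag. pos_counts itself is built in code this diff leaves unchanged; presumably it is a collections.Counter (or an nltk FreqDist) over pos_tag output, since a plain dict would raise KeyError for any tag the input never produces. A sketch under that assumption:

from collections import Counter
from nltk import pos_tag, word_tokenize

def count_pos_tags(text):
    # Counter maps each Penn Treebank tag to its frequency; absent tags count as 0.
    tagged = pos_tag(word_tokenize(text))
    return Counter(tag for _, tag in tagged)

pos_counts = count_pos_tags("She would quickly read their books.")
print(pos_counts['MD'], pos_counts['PRP$'], pos_counts['WDT'])  # 1 1 0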
@@ -110,18 +136,35 @@ def extract_features(text):
         'lemma_count': lemma_count,
         'named_entity_count': named_entity_count,
     }
-    features.update(pos_features)
+    # features.update(pos_features)
+    features = pd.concat([features, pd.DataFrame(pos_features, index=[0])], axis=1)
 
     return pd.Series(features)
 
-
-
-
-
-
+# Function from Code(2)
+def add_vectorized_features(df):
+    vectorizer = CountVectorizer()
+    tfidf_vectorizer = TfidfVectorizer()
+    X_bow = vectorizer.fit_transform(df['paragraph'])
+    X_tfidf = tfidf_vectorizer.fit_transform(df['paragraph'])
+    df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
+    df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
+    df = pd.concat([df, df_bow, df_tfidf], axis=1)
+    return df
+
+
+# Function define
+def AI_vs_AI_RandomForest_88_Samples(df):
+
+    # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
+    with open('RandomForestClassifier.pkl', 'rb') as file:
+        clf_loaded = pickle.load(file)
+
     input_features = df['paragraph'].apply(extract_features)
+
     predicted_llm = clf_loaded.predict(input_features)
+    st.write(f"Predicted LLM: {predicted_llm[0]}")
+
-
 
     predicted_proba = clf_loaded.predict_proba(input_features)
     probabilities = predicted_proba[0]
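Note: a caution on the replaced merge line. Inside extract_features_AI_vs_AI_Ridge_2000_Samples, features is still a plain dict when pd.concat([features, pd.DataFrame(pos_features, index=[0])], axis=1) runs, and pd.concat does not accept a dict in that list, so this line raises a TypeError; the commented-out features.update(pos_features) was the flat-dict merge. A minimal sketch of a merge that keeps the function returning a flat pd.Series (assuming both inputs are dicts of scalars):

import pandas as pd

def merge_feature_dicts(features, pos_features):
    # Merge the two flat dicts, then wrap once; keys become the Series index.
    merged = {**features, **pos_features}
    return pd.Series(merged)

row = merge_feature_dicts({'char_count': 120}, {'pos_VB': 3, 'pos_NNP': 1})
print(row['pos_VB'])  # 3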
@@ -146,4 +189,49 @@ if press_me_button:
     for llm, prob in prob_dict.items():
         st.write(llm + ': ' + prob)
         st.progress(float(prob.strip('%'))/100)
+    return
+
+def AI_vs_AI_Ridge_2000_Samples(df):
+
+    # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
+    with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'rb') as file:
+        clf_loaded = pickle.load(file)
+
+
+    input_features = df['paragraph'].apply(extract_features_AI_vs_AI_Ridge_2000_Samples)
+
+    # Here, input_features is a DataFrame, not a Series
+    input_features = pd.concat(input_features.values, ignore_index=True)
+
+    # Add new vectorized features
+    df = add_vectorized_features(df)
+
+    # Concatenate input_features and df along columns
+    final_features = pd.concat([input_features, df], axis=1)
+
+    predicted_llm = clf_loaded.predict(final_features)
+    st.write(f"Predicted LLM: {predicted_llm[0]}")
+
+    return
+
+
+# Creates a button named 'Press me'
+press_me_button = st.button("Which Model Used?")
+
+if press_me_button:
+
+    # Use the selected option to control the flow of your application
+    if selected_option == "AI vs AI - RandomForest - 88 Samples":
+        AI_vs_AI_RandomForest_88_Samples(df)
+
+    elif selected_option == "AI vs AI - Ridge - 2000 Samples":
+        AI_vs_AI_Ridge_2000_Samples(df)
+
+    elif selected_option == "AI vs Human":
+        st.write("You selected AI vs Human!")
+
 
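Note on the Ridge path: add_vectorized_features calls fit_transform at prediction time, so the bag-of-words and TF-IDF vocabularies are rebuilt from the single input paragraph and will not line up with the columns the Ridge model was trained on. The usual pattern is to fit the vectorizer once during training, persist it next to the model, and only transform at inference. A minimal sketch of that pattern (file names are hypothetical):

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# --- training time ---
train_texts = ["example document one", "example document two"]
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_texts)      # learn the vocabulary once
with open('tfidf_vectorizer.pkl', 'wb') as f:   # persist alongside the model
    pickle.dump(tfidf, f)

# --- inference time ---
with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_loaded = pickle.load(f)
X_new = tfidf_loaded.transform(["a new paragraph"])  # same columns as training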
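Note: one detail the diff never shows is how df is created before df['paragraph'] is used. Presumably the unchanged portion of app.py wraps the text-area input in a one-row DataFrame, along these lines:

import pandas as pd

# Presumed, from the unchanged portion of app.py: one row holding the pasted text,
# so df['paragraph'].apply(...) yields exactly one feature row.
df = pd.DataFrame({'paragraph': [input_paragraph]})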