jaifar530 committed on
Commit b4ae0dd · unverified · 1 Parent(s): d352c5a
Files changed (1)
  1. app.py +343 -193
app.py CHANGED
@@ -1,4 +1,12 @@
 
1
  import streamlit as st
 
2
  import os
3
  import requests
4
  import pickle
@@ -12,7 +20,6 @@ import numpy as np
12
  from nltk.stem import WordNetLemmatizer
13
  from nltk import ne_chunk, pos_tag, word_tokenize
14
  from nltk.tree import Tree
15
- from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
16
  nltk.download('wordnet')
17
  nltk.download('maxent_ne_chunker')
18
  nltk.download('words')
@@ -22,59 +29,10 @@ nltk.download('punkt')
22
  nltk.download('stopwords')
23
  nltk.download('averaged_perceptron_tagger')
24
 
25
- #version
26
- st.markdown("v1.9")
27
-
28
-
29
- # URL of the text file
30
- url = 'https://jaifar.net/text.txt'
31
-
32
- headers = {
33
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
34
- }
35
-
36
- response = requests.get(url, headers=headers)
37
-
38
- # Check if the request was successful
39
- if response.status_code == 200:
40
- # Read the content of the file
41
- content = response.text
42
-
43
- # Print the content of the file
44
- # print(content)
45
- else:
46
- # Handle the case when the request fails
47
- print('Failed to download the file.')
48
-
49
-
50
-
51
- #title
52
- st.title("Smart Detection System of AI-Generated Text Models")
53
-
54
- #subtitle
55
- st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
56
-
57
- #input text
58
- input_paragraph = st.text_area("Input your text here")
59
- words_counts = word_tokenize(input_paragraph)
60
- final_words = len(words_counts)
61
- st.write('Words counts: ', final_words)
62
-
63
- # Define your options
64
- options = ["AI vs AI - RandomForest - 88 Samples", "AI vs AI - Ridge - 2000 Samples", "AI vs Human"]
65
-
66
- # Create a dropdown menu with "Option 2" as the default
67
- # selected_option = st.selectbox('Select an Option', options, index=1)
68
- selected_option = st.selectbox('Select an Option', options)
69
-
70
-
71
-
72
-
73
-
74
  # Check if the file exists
75
- if not os.path.isfile('AI_vs_AI_Ridge_2000_Samples.pkl'):
76
  # Download the pickle file if it doesn't exist
77
- url = 'https://jaifar.net/AI_vs_AI_Ridge_2000_Samples.pkl'
78
  headers = {
79
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
80
  }
@@ -82,8 +40,18 @@ if not os.path.isfile('AI_vs_AI_Ridge_2000_Samples.pkl'):
82
  response = requests.get(url, headers=headers)
83
 
84
  # Save the file
85
- with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'wb') as file2:
86
- file2.write(response.content)
87
 
88
 
89
 
@@ -100,9 +68,8 @@ num_words = 500
100
  # Retrieving only the first num_words words of the paragraph
101
  input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
102
 
103
-
104
  # Extracting features
105
- def extract_features_AI_vs_AI_RandomForest_88_Samples(text):
106
  words = word_tokenize(text)
107
  sentences = sent_tokenize(text)
108
 
@@ -149,182 +116,365 @@ def extract_features_AI_vs_AI_RandomForest_88_Samples(text):
149
  return pd.Series(features)
150
 
151
 
 
 
152
 
153
- # Extracting features for AI_vs_AI_Ridge_2000_Samples
154
- def extract_features_AI_vs_AI_Ridge_2000_Samples(text):
155
-
156
- words = word_tokenize(text)
157
- sentences = sent_tokenize(text)
158
 
159
- avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
160
- avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
161
- punctuation_count = len([char for char in text if char in '.,;:?!'])
162
- stopword_count = len([word for word in words if word in stopwords.words('english')])
163
 
164
- lemmatizer = WordNetLemmatizer()
165
- lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
166
 
167
- named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
 
168
 
169
- tagged_words = nltk.pos_tag(words)
170
- pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
171
- pos_features = {
172
- 'pos_IN': pos_counts['IN'],
173
- 'pos_DT': pos_counts['DT'],
174
- 'pos_NN': pos_counts['NN'],
175
- 'pos_,': pos_counts[','],
176
- 'pos_VBZ': pos_counts['VBZ'],
177
- 'pos_WDT': pos_counts['WDT'],
178
- 'pos_TO': pos_counts['TO'],
179
- 'pos_VB': pos_counts['VB'],
180
- 'pos_PRP': pos_counts['PRP'],
181
- 'pos_VBP': pos_counts['VBP'],
182
- 'pos_VBG': pos_counts['VBG'],
183
- 'pos_.': pos_counts['.'],
184
- 'pos_JJ': pos_counts['JJ'],
185
- 'pos_NNS': pos_counts['NNS'],
186
- 'pos_RB': pos_counts['RB'],
187
- 'pos_PRP$': pos_counts['PRP$'],
188
- 'pos_CC': pos_counts['CC'],
189
- 'pos_MD': pos_counts['MD'],
190
- 'pos_VBN': pos_counts['VBN'],
191
- 'pos_NNP': pos_counts['NNP'],
192
- }
193
 
194
- features = {
195
- 'avg_word_length': avg_word_length,
196
- 'avg_sent_length': avg_sent_length,
197
- 'punctuation_count': punctuation_count,
198
- 'stopword_count': stopword_count,
199
- 'lemma_count': lemma_count,
200
- 'named_entity_count': named_entity_count,
201
- }
202
- # features.update(pos_features)
203
- features = pd.concat([features, pd.DataFrame(pos_features, index=[0])], axis=1)
204
 
205
- return pd.Series(features)
 
206
 
207
- # Function from Code(2)
208
- def add_vectorized_features(df):
209
- vectorizer = CountVectorizer()
210
- tfidf_vectorizer = TfidfVectorizer()
211
- X_bow = vectorizer.fit_transform(df['paragraph'])
212
- X_tfidf = tfidf_vectorizer.fit_transform(df['paragraph'])
213
- df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
214
- df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
215
- df = pd.concat([df, df_bow, df_tfidf], axis=1)
216
- return df
217
 
218
 
219
- # Function define AI_vs_AI_RandomForest_88_Samples
220
- def AI_vs_AI_RandomForest_88_Samples(df):
 
221
 
222
 
223
 
224
 
225
- input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
226
- # try:
227
- # predicted_llm = clf_loaded.predict(input_features)
228
- # st.write(f"Predicted LLM: {predicted_llm[0]}")
229
- # predicted_proba = clf_loaded.predict_proba(input_features)
230
- # except Exception as e:
231
- # st.write(f"An error occurred: {str(e)}")
232
 
233
- # labels = clf_loaded.classes_
234
 
235
- # # Create a mapping from old labels to new labels
236
- # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
237
 
238
- # # Apply the mapping to the labels
239
- # new_labels = [label_mapping[label] for label in labels]
240
 
241
- # # Create a dictionary that maps new labels to probabilities
242
- # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
243
 
244
- # # Convert probabilities to percentages and sort the dictionary in descending order
245
- # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
246
 
247
- # # Print the dictionary
248
- # #st.write(prob_dict)
249
 
250
- # # Create a progress bar and a bar chart for each LLM
251
- # for llm, prob in prob_dict.items():
252
- # st.write(llm + ': ' + prob)
253
- # st.progress(float(prob.strip('%'))/100)
254
- return
255
 
256
 
257
- def AI_vs_AI_Ridge_2000_Samples(df):
258
 
259
- # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
260
- with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'rb') as file2:
261
- clf_loaded = pickle.load(file2)
262
 
263
 
264
- input_features = df['paragraph'].apply(extract_features_AI_vs_AI_Ridge_2000_Samples)
265
 
266
- # Here, input_features is a DataFrame, not a Series
267
- input_features = pd.concat(input_features.values, ignore_index=True)
268
 
269
- # Add new vectorized features
270
- df = add_vectorized_features(df)
271
 
272
- # Concatenate input_features and df along columns
273
- final_features = pd.concat([input_features, df], axis=1)
274
 
275
- predicted_llm = clf_loaded.predict(final_features)
276
- st.write(f"Predicted LLM: {predicted_llm[0]}")
277
 
278
- return
279
 
280
 
281
 
282
- # Check if the file exists
283
- if not os.path.isfile('AI_vs_AI_RandomForest_88_Samples.pkl'):
284
- # Download the zip file if it doesn't exist
285
- url = 'https://jaifar.net/AI_vs_AI_RandomForest_88_Samples.pkl'
286
- headers = {
287
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
288
- }
289
 
290
- response = requests.get(url, headers=headers)
291
 
292
- # Save the file
293
- try:
294
- with open('AI_vs_AI_RandomForest_88_Samples.pkl', 'wb') as file:
295
- file.write(response.content)
296
- except Exception as e:
297
- st.write(f"An error occurred while writing AI_vs_AI_RandomForest_88_Samples.pkl: {str(e)}")
298
-
299
- try:
300
- with open('AI_vs_AI_RandomForest_88_Samples.pkl', 'rb') as file:
301
- clf_loaded = pickle.load(file)
302
- except Exception as e:
303
- st.write(f"An error occurred while loading AI_vs_AI_RandomForest_88_Samples.pkl: {str(e)}")
304
-
305
- # Creates a button
306
- press_me_button = st.button("Which Model Used?")
307
 
308
- if press_me_button:
309
 
310
- input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
311
 
312
- try:
313
- predicted_llm = clf_loaded.predict(input_features)
314
- st.write(f"Predicted LLM: {predicted_llm[0]}")
315
- predicted_proba = clf_loaded.predict_proba(input_features)
316
- except Exception as e:
317
- st.write(f"An error occurred: {str(e)}")
318
-
319
- # # Use the selected option to control the flow of your application
320
- # if selected_option == "AI vs AI - RandomForest - 88 Samples":
321
- # AI_vs_AI_RandomForest_88_Samples(df)
322
-
323
- # elif selected_option == "AI vs AI - Ridge - 2000 Samples":
324
- # AI_vs_AI_Ridge_2000_Samples(df)
325
-
326
- # elif selected_option == "AI vs Human":
327
- # st.write("You selected AI vs Human!")
328
 
329
 
330
 
 
1
+
2
  import streamlit as st
3
+
4
+ #title
5
+ st.title("Smart Detection System of AI-Generated Text Models")
6
+
7
+ #subtitle
8
+ st.markdown("This is a POC for the Smart Detection System of AI-Generated Text Models project (:blue[MSc Data Analytics]). It is a pre-trained model that detects the probabilities of using any of the known LLMs (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
9
+
10
  import os
11
  import requests
12
  import pickle
 
20
  from nltk.stem import WordNetLemmatizer
21
  from nltk import ne_chunk, pos_tag, word_tokenize
22
  from nltk.tree import Tree
 
23
  nltk.download('wordnet')
24
  nltk.download('maxent_ne_chunker')
25
  nltk.download('words')
 
29
  nltk.download('stopwords')
30
  nltk.download('averaged_perceptron_tagger')
31
 
32
  # Check if the file exists
33
+ if not os.path.isfile('RandomForestClassifier.pkl'):
34
  # Download the pickle file if it doesn't exist
35
+ url = 'https://jaifar.net/RandomForestClassifier.pkl'
36
  headers = {
37
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
38
  }
 
40
  response = requests.get(url, headers=headers)
41
 
42
  # Save the file
43
+ with open('RandomForestClassifier.pkl', 'wb') as file:
44
+ file.write(response.content)
45
+
46
+ # At this point the pickle file should exist: either it was already there or it has just been downloaded.
47
+ with open('RandomForestClassifier.pkl', 'rb') as file:
48
+ clf_loaded = pickle.load(file)
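Note that the block above writes response.content to disk even when the request fails, which would later make pickle.load choke on an HTML error page. A minimal sketch of a more defensive variant, assuming the same URL and filename used in this commit (the status check and st.error/st.stop handling are illustrative, not part of the diff):

    import os
    import pickle
    import requests
    import streamlit as st

    MODEL_PATH = 'RandomForestClassifier.pkl'                      # same filename as above
    MODEL_URL = 'https://jaifar.net/RandomForestClassifier.pkl'    # same URL as above

    if not os.path.isfile(MODEL_PATH):
        resp = requests.get(MODEL_URL, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        if resp.status_code == 200:
            with open(MODEL_PATH, 'wb') as f:
                f.write(resp.content)
        else:
            st.error(f'Model download failed with HTTP {resp.status_code}')
            st.stop()

    with open(MODEL_PATH, 'rb') as f:
        clf_loaded = pickle.load(f)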
49
+
50
+
51
+ input_paragraph = st.text_area("Input your text here")
52
+ words_counts = word_tokenize(input_paragraph)
53
+ final_words = len(words_counts)
54
+ st.write('Word count: ', final_words)
55
 
56
 
57
 
 
68
  # Retrieving only the first num_words words of the paragraph
69
  input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
70
 
 
71
  # Extracting features
72
+ def extract_features(text):
73
  words = word_tokenize(text)
74
  sentences = sent_tokenize(text)
75
 
 
116
  return pd.Series(features)
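Because extract_features returns a pd.Series, applying it over the one-row 'paragraph' column yields a one-row feature DataFrame that the classifier can consume directly. A self-contained sketch of that flow; the stand-in below re-implements only two of the features named in this file (avg_word_length, punctuation_count), and the sample text is made up:

    import pandas as pd

    def extract_features_demo(text):
        # Simplified stand-in for the real extract_features defined above.
        words = text.split()
        avg_word_length = sum(len(w) for w in words if w.isalpha()) / max(len(words), 1)
        punctuation_count = sum(1 for ch in text if ch in '.,;:?!')
        return pd.Series({'avg_word_length': avg_word_length,
                          'punctuation_count': punctuation_count})

    # The app builds a one-row DataFrame from the text area in the same way.
    df = pd.DataFrame(['Example paragraph typed into the app.'], columns=['paragraph'])
    input_features = df['paragraph'].apply(extract_features_demo)
    print(input_features.shape)   # (1, 2): one row, one column per feature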
117
 
118
 
119
+ # Create the prediction button
120
+ press_me_button = st.button("Which Model Was Used?")
121
 
122
+ if press_me_button:
123
+ input_features = df['paragraph'].apply(extract_features)
124
+ predicted_llm = clf_loaded.predict(input_features)
125
+ #st.write(f"Predicted LLM: {predicted_llm[0]}")
 
126
 
127
+ predicted_proba = clf_loaded.predict_proba(input_features)
128
+ probabilities = predicted_proba[0]
129
+ labels = clf_loaded.classes_
 
130
 
131
+ # Create a mapping from old labels to new labels
132
+ label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
133
 
134
+ # Apply the mapping to the labels
135
+ new_labels = [label_mapping[label] for label in labels]
136
 
137
+ # Create a dictionary that maps new labels to probabilities
138
+ prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
139
 
140
+ # Convert probabilities to percentages and sort the dictionary in descending order
141
+ prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
142
 
143
+ # Print the dictionary
144
+ #st.write(prob_dict)
145
+
146
+ # Create a progress bar and a bar chart for each LLM
147
+ for llm, prob in prob_dict.items():
148
+ st.write(llm + ': ' + prob)
149
+ st.progress(float(prob.strip('%'))/100)
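The comment above mentions both a progress bar and a bar chart, but only st.progress is drawn. A small sketch of the bar-chart half; in the app the two variables would be the new_labels and probabilities computed earlier, and the literal values below are placeholders:

    import pandas as pd
    import streamlit as st

    # Placeholder values; in app.py these come from clf_loaded.predict_proba / clf_loaded.classes_.
    new_labels = ['gpt3', 'gpt4', 'googlebard', 'huggingface']
    probabilities = [0.52, 0.23, 0.15, 0.10]

    prob_series = pd.Series(probabilities, index=new_labels).sort_values(ascending=False)
    st.bar_chart(prob_series)   # one bar per LLM, values in [0, 1]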
150
+
151
+ # import streamlit as st
152
+ # import os
153
+ # import requests
154
+ # import pickle
155
+ # import pandas as pd
156
+ # import nltk
157
+ # import spacy
158
+ # from nltk.corpus import stopwords
159
+ # from nltk.tokenize import word_tokenize, sent_tokenize
160
+ # import numpy as np
161
+ # ############
162
+ # from nltk.stem import WordNetLemmatizer
163
+ # from nltk import ne_chunk, pos_tag, word_tokenize
164
+ # from nltk.tree import Tree
165
+ # from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
166
+ # nltk.download('wordnet')
167
+ # nltk.download('maxent_ne_chunker')
168
+ # nltk.download('words')
169
+
170
+ # #######
171
+ # nltk.download('punkt')
172
+ # nltk.download('stopwords')
173
+ # nltk.download('averaged_perceptron_tagger')
174
+
175
+ # #version
176
+ # st.markdown("v1.9")
177
+
178
+
179
+ # # URL of the text file
180
+ # url = 'https://jaifar.net/text.txt'
181
+
182
+ # headers = {
183
+ # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
184
+ # }
185
+
186
+ # response = requests.get(url, headers=headers)
187
+
188
+ # # Check if the request was successful
189
+ # if response.status_code == 200:
190
+ # # Read the content of the file
191
+ # content = response.text
192
+
193
+ # # Print the content of the file
194
+ # # print(content)
195
+ # else:
196
+ # # Handle the case when the request fails
197
+ # print('Failed to download the file.')
198
+
199
+
200
+
201
+ # #title
202
+ # st.title("Smart Detection System of AI-Generated Text Models")
203
+
204
+ # #subtitle
205
+ # st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
206
+
207
+ # #input text
208
+ # input_paragraph = st.text_area("Input your text here")
209
+ # words_counts = word_tokenize(input_paragraph)
210
+ # final_words = len(words_counts)
211
+ # st.write('Words counts: ', final_words)
212
+
213
+ # # Define your options
214
+ # options = ["AI vs AI - RandomForest - 88 Samples", "AI vs AI - Ridge - 2000 Samples", "AI vs Human"]
215
+
216
+ # # Create a dropdown menu with "Option 2" as the default
217
+ # # selected_option = st.selectbox('Select an Option', options, index=1)
218
+ # selected_option = st.selectbox('Select an Option', options)
219
 
220
 
221
 
222
+
223
+
224
+ # # Check if the file exists
225
+ # if not os.path.isfile('AI_vs_AI_Ridge_2000_Samples.pkl'):
226
+ # # Download the zip file if it doesn't exist
227
+ # url = 'https://jaifar.net/AI_vs_AI_Ridge_2000_Samples.pkl'
228
+ # headers = {
229
+ # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
230
+ # }
231
+
232
+ # response = requests.get(url, headers=headers)
233
+
234
+ # # Save the file
235
+ # with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'wb') as file2:
236
+ # file2.write(response.content)
237
+
238
+
239
+
240
+ # # df = pd.DataFrame(columns=["paragraph"])
241
+ # # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
242
+
243
+ # df = pd.DataFrame([input_paragraph], columns=["paragraph"])
244
+
245
+
246
+
247
+ # # Variable to control number of words to retrieve
248
+ # num_words = 500
249
+
250
+ # # Retrieving only the first num_words words of the paragraph
251
+ # input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
252
+
253
+
254
+ # # Extracting features
255
+ # def extract_features_AI_vs_AI_RandomForest_88_Samples(text):
256
+ # words = word_tokenize(text)
257
+ # sentences = sent_tokenize(text)
258
+
259
+ # avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
260
+ # avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
261
+ # punctuation_count = len([char for char in text if char in '.,;:?!'])
262
+ # stopword_count = len([word for word in words if word in stopwords.words('english')])
263
+
264
+ # lemmatizer = WordNetLemmatizer()
265
+ # lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
266
+
267
+ # named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
268
+
269
+ # tagged_words = nltk.pos_tag(words)
270
+ # pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
271
+ # pos_features = {
272
+ # 'pos_IN': pos_counts['IN'],
273
+ # 'pos_DT': pos_counts['DT'],
274
+ # 'pos_NN': pos_counts['NN'],
275
+ # 'pos_,': pos_counts[','],
276
+ # 'pos_VBZ': pos_counts['VBZ'],
277
+ # 'pos_WDT': pos_counts['WDT'],
278
+ # 'pos_TO': pos_counts['TO'],
279
+ # 'pos_VB': pos_counts['VB'],
280
+ # 'pos_VBG': pos_counts['VBG'],
281
+ # 'pos_.': pos_counts['.'],
282
+ # 'pos_JJ': pos_counts['JJ'],
283
+ # 'pos_NNS': pos_counts['NNS'],
284
+ # 'pos_RB': pos_counts['RB'],
285
+ # 'pos_CC': pos_counts['CC'],
286
+ # 'pos_VBN': pos_counts['VBN'],
287
+ # }
288
+
289
+ # features = {
290
+ # 'avg_word_length': avg_word_length,
291
+ # 'avg_sent_length': avg_sent_length,
292
+ # 'punctuation_count': punctuation_count,
293
+ # 'stopword_count': stopword_count,
294
+ # 'lemma_count': lemma_count,
295
+ # 'named_entity_count': named_entity_count,
296
+ # }
297
+ # features.update(pos_features)
298
+
299
+ # return pd.Series(features)
300
+
301
+
302
+
303
+ # # Extracting features for AI_vs_AI_Ridge_2000_Samples
304
+ # def extract_features_AI_vs_AI_Ridge_2000_Samples(text):
305
+
306
+ # words = word_tokenize(text)
307
+ # sentences = sent_tokenize(text)
308
+
309
+ # avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
310
+ # avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
311
+ # punctuation_count = len([char for char in text if char in '.,;:?!'])
312
+ # stopword_count = len([word for word in words if word in stopwords.words('english')])
313
+
314
+ # lemmatizer = WordNetLemmatizer()
315
+ # lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
316
+
317
+ # named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
318
+
319
+ # tagged_words = nltk.pos_tag(words)
320
+ # pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
321
+ # pos_features = {
322
+ # 'pos_IN': pos_counts['IN'],
323
+ # 'pos_DT': pos_counts['DT'],
324
+ # 'pos_NN': pos_counts['NN'],
325
+ # 'pos_,': pos_counts[','],
326
+ # 'pos_VBZ': pos_counts['VBZ'],
327
+ # 'pos_WDT': pos_counts['WDT'],
328
+ # 'pos_TO': pos_counts['TO'],
329
+ # 'pos_VB': pos_counts['VB'],
330
+ # 'pos_PRP': pos_counts['PRP'],
331
+ # 'pos_VBP': pos_counts['VBP'],
332
+ # 'pos_VBG': pos_counts['VBG'],
333
+ # 'pos_.': pos_counts['.'],
334
+ # 'pos_JJ': pos_counts['JJ'],
335
+ # 'pos_NNS': pos_counts['NNS'],
336
+ # 'pos_RB': pos_counts['RB'],
337
+ # 'pos_PRP$': pos_counts['PRP$'],
338
+ # 'pos_CC': pos_counts['CC'],
339
+ # 'pos_MD': pos_counts['MD'],
340
+ # 'pos_VBN': pos_counts['VBN'],
341
+ # 'pos_NNP': pos_counts['NNP'],
342
+ # }
343
+
344
+ # features = {
345
+ # 'avg_word_length': avg_word_length,
346
+ # 'avg_sent_length': avg_sent_length,
347
+ # 'punctuation_count': punctuation_count,
348
+ # 'stopword_count': stopword_count,
349
+ # 'lemma_count': lemma_count,
350
+ # 'named_entity_count': named_entity_count,
351
+ # }
352
+ # # features.update(pos_features)
353
+ # features = pd.concat([features, pd.DataFrame(pos_features, index=[0])], axis=1)
354
+
355
+ # return pd.Series(features)
356
+
357
+ # # Function from Code(2)
358
+ # def add_vectorized_features(df):
359
+ # vectorizer = CountVectorizer()
360
+ # tfidf_vectorizer = TfidfVectorizer()
361
+ # X_bow = vectorizer.fit_transform(df['paragraph'])
362
+ # X_tfidf = tfidf_vectorizer.fit_transform(df['paragraph'])
363
+ # df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
364
+ # df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
365
+ # df = pd.concat([df, df_bow, df_tfidf], axis=1)
366
+ # return df
367
+
368
+
369
+ # # Function define AI_vs_AI_RandomForest_88_Samples
370
+ # def AI_vs_AI_RandomForest_88_Samples(df):
371
 
372
 
373
 
374
 
375
+ # input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
376
+ # # try:
377
+ # # predicted_llm = clf_loaded.predict(input_features)
378
+ # # st.write(f"Predicted LLM: {predicted_llm[0]}")
379
+ # # predicted_proba = clf_loaded.predict_proba(input_features)
380
+ # # except Exception as e:
381
+ # # st.write(f"An error occurred: {str(e)}")
382
 
383
+ # # labels = clf_loaded.classes_
384
 
385
+ # # # Create a mapping from old labels to new labels
386
+ # # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
387
 
388
+ # # # Apply the mapping to the labels
389
+ # # new_labels = [label_mapping[label] for label in labels]
390
 
391
+ # # # Create a dictionary that maps new labels to probabilities
392
+ # # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
393
 
394
+ # # # Convert probabilities to percentages and sort the dictionary in descending order
395
+ # # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
396
 
397
+ # # # Print the dictionary
398
+ # # #st.write(prob_dict)
399
 
400
+ # # # Create a progress bar and a bar chart for each LLM
401
+ # # for llm, prob in prob_dict.items():
402
+ # # st.write(llm + ': ' + prob)
403
+ # # st.progress(float(prob.strip('%'))/100)
404
+ # return
405
 
406
 
407
+ # def AI_vs_AI_Ridge_2000_Samples(df):
408
 
409
+ # # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
410
+ # with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'rb') as file2:
411
+ # clf_loaded = pickle.load(file2)
412
 
413
 
414
+ # input_features = df['paragraph'].apply(extract_features_AI_vs_AI_Ridge_2000_Samples)
415
 
416
+ # # Here, input_features is a DataFrame, not a Series
417
+ # input_features = pd.concat(input_features.values, ignore_index=True)
418
 
419
+ # # Add new vectorized features
420
+ # df = add_vectorized_features(df)
421
 
422
+ # # Concatenate input_features and df along columns
423
+ # final_features = pd.concat([input_features, df], axis=1)
424
 
425
+ # predicted_llm = clf_loaded.predict(final_features)
426
+ # st.write(f"Predicted LLM: {predicted_llm[0]}")
427
 
428
+ # return
429
 
430
 
431
 
432
+ # # Check if the file exists
433
+ # if not os.path.isfile('AI_vs_AI_RandomForest_88_Samples.pkl'):
434
+ # # Download the zip file if it doesn't exist
435
+ # url = 'https://jaifar.net/AI_vs_AI_RandomForest_88_Samples.pkl'
436
+ # headers = {
437
+ # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
438
+ # }
439
 
440
+ # response = requests.get(url, headers=headers)
441
 
442
+ # # Save the file
443
+ # try:
444
+ # with open('AI_vs_AI_RandomForest_88_Samples.pkl', 'wb') as file:
445
+ # file.write(response.content)
446
+ # except Exception as e:
447
+ # st.write(f"An error occurred while writing AI_vs_AI_RandomForest_88_Samples.pkl: {str(e)}")
 
448
 
449
+ # try:
450
+ # with open('AI_vs_AI_RandomForest_88_Samples.pkl', 'rb') as file:
451
+ # clf_loaded = pickle.load(file)
452
+ # except Exception as e:
453
+ # st.write(f"An error occurred while loading AI_vs_AI_RandomForest_88_Samples.pkl: {str(e)}")
454
+
455
+ # # Creates a button
456
+ # press_me_button = st.button("Which Model Used?")
457
+
458
+ # if press_me_button:
459
 
460
+ # input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
461
 
462
+ # try:
463
+ # predicted_llm = clf_loaded.predict(input_features)
464
+ # st.write(f"Predicted LLM: {predicted_llm[0]}")
465
+ # predicted_proba = clf_loaded.predict_proba(input_features)
466
+ # except Exception as e:
467
+ # st.write(f"An error occurred: {str(e)}")
468
+
469
+ # # # Use the selected option to control the flow of your application
470
+ # # if selected_option == "AI vs AI - RandomForest - 88 Samples":
471
+ # # AI_vs_AI_RandomForest_88_Samples(df)
472
+
473
+ # # elif selected_option == "AI vs AI - Ridge - 2000 Samples":
474
+ # # AI_vs_AI_Ridge_2000_Samples(df)
475
+
476
+ # # elif selected_option == "AI vs Human":
477
+ # # st.write("You selected AI vs Human!")
478
 
479
 
480