jaifar530 committed
Commit e5f5906 · unverified · 1 Parent(s): d0ecce0

new version

Files changed (1)
  1. app.py +206 -102
app.py CHANGED
@@ -1,40 +1,30 @@

 import streamlit as st
 #subtitle
- st.markdown("version: 1.2")
 #title
 st.title("Smart Detection System of AI-Generated Text Models")

 #subtitle
 st.markdown("This is a POC for the Smart Detection System of AI-Generated Text Models project (:blue[MSc Data Analytics]); it is a pre-trained model that detects the probabilities of using any of the known LLMs (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")

 import os
 import requests
- # import pickle
- import pandas as pd
- import nltk
- import spacy
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize, sent_tokenize
 import numpy as np
 ############
- from nltk.stem import WordNetLemmatizer
- from nltk import ne_chunk, pos_tag, word_tokenize
- from nltk.tree import Tree
- from joblib import dump, load
- nltk.download('wordnet')
- nltk.download('maxent_ne_chunker')
- nltk.download('words')
-
- #######
- nltk.download('punkt')
- nltk.download('stopwords')
- nltk.download('averaged_perceptron_tagger')

 # Check if the file exists
- if not os.path.isfile('RandomForestClassifier.joblib'):

-     url = 'https://jaifar.net/RandomForestClassifier.joblib'
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
     }
@@ -42,116 +32,230 @@ if not os.path.isfile('RandomForestClassifier.joblib'):
     response = requests.get(url, headers=headers)


-     with open('RandomForestClassifier.joblib', 'wb') as file:
         file.write(response.content)


- # Load the model from the file
- clf_loaded = load('RandomForestClassifier.joblib')
-
- # # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
- # with open('RandomForestClassifier.pkl', 'rb') as file:
- #     clf_loaded = pickle.load(file)

 input_paragraph = st.text_area("Input your text here")
 words_counts = word_tokenize(input_paragraph)
 final_words = len(words_counts)
 st.write('Words counts: ', final_words)

-
-
- # df = pd.DataFrame(columns=["paragraph"])
- # df = df.append({"paragraph": input_paragraph}, ignore_index=True)

 df = pd.DataFrame([input_paragraph], columns=["paragraph"])

-
-
- # Variable to control number of words to retrieve
- num_words = 500
-
- # Retrieving only the first num_words words of the paragraph
- input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
-
 # Extracting features
 def extract_features(text):
-     words = word_tokenize(text)
-     sentences = sent_tokenize(text)
-
-     avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
-     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
-     punctuation_count = len([char for char in text if char in '.,;:?!'])
-     stopword_count = len([word for word in words if word in stopwords.words('english')])
-
-     lemmatizer = WordNetLemmatizer()
-     lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-
-     named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
-
-     tagged_words = nltk.pos_tag(words)
-     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
-     pos_features = {
-         'pos_IN': pos_counts['IN'],
-         'pos_DT': pos_counts['DT'],
-         'pos_NN': pos_counts['NN'],
-         'pos_,': pos_counts[','],
-         'pos_VBZ': pos_counts['VBZ'],
-         'pos_WDT': pos_counts['WDT'],
-         'pos_TO': pos_counts['TO'],
-         'pos_VB': pos_counts['VB'],
-         'pos_VBG': pos_counts['VBG'],
-         'pos_.': pos_counts['.'],
-         'pos_JJ': pos_counts['JJ'],
-         'pos_NNS': pos_counts['NNS'],
-         'pos_RB': pos_counts['RB'],
-         'pos_CC': pos_counts['CC'],
-         'pos_VBN': pos_counts['VBN'],
-     }

-     features = {
-         'avg_word_length': avg_word_length,
-         'avg_sent_length': avg_sent_length,
-         'punctuation_count': punctuation_count,
-         'stopword_count': stopword_count,
-         'lemma_count': lemma_count,
-         'named_entity_count': named_entity_count,
-     }
-     features.update(pos_features)

-     return pd.Series(features)


- # Creates a button named 'Press me'
- press_me_button = st.button("Which Model Used?")

 if press_me_button:
     input_features = df['paragraph'].apply(extract_features)
     predicted_llm = clf_loaded.predict(input_features)
-     #st.write(f"Predicted LLM: {predicted_llm[0]}")

-     predicted_proba = clf_loaded.predict_proba(input_features)
-     probabilities = predicted_proba[0]
-     labels = clf_loaded.classes_

-     # Create a mapping from old labels to new labels
-     label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}

-     # Apply the mapping to the labels
-     new_labels = [label_mapping[label] for label in labels]

-     # Create a dictionary that maps new labels to probabilities
-     prob_dict = {k: v for k, v in zip(new_labels, probabilities)}

-     # Convert probabilities to percentages and sort the dictionary in descending order
-     prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}

-     # Print the dictionary
-     #st.write(prob_dict)

-     # Create a progress bar and a bar chart for each LLM
-     for llm, prob in prob_dict.items():
-         st.write(llm + ': ' + prob)
-         st.progress(float(prob.strip('%'))/100)

 # import streamlit as st
 # import os
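One note on the removed feature extractor above: `avg_word_length` and `avg_sent_length` divide by `len(words)` and `len(sentences)`, so pressing the button with an empty text area raises a `ZeroDivisionError` before the classifier is ever called. A minimal guard, assuming zero-valued averages are an acceptable fallback for empty input; this sketch is illustrative and not part of the commit:

    from nltk.tokenize import word_tokenize, sent_tokenize  # requires nltk.download('punkt')

    def average_lengths(text: str) -> dict:
        # Mirrors the removed avg_word_length / avg_sent_length features,
        # but falls back to 0.0 instead of dividing by zero on empty input.
        words = word_tokenize(text)
        sentences = sent_tokenize(text)
        avg_word_length = sum(len(w) for w in words if w.isalpha()) / len(words) if words else 0.0
        avg_sent_length = sum(len(s) for s in sentences) / len(sentences) if sentences else 0.0
        return {"avg_word_length": avg_word_length, "avg_sent_length": avg_sent_length}

The same guard would apply to any of the other ratio-style features before they are collected into `pd.Series(features)`.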
 

+
 import streamlit as st
 #subtitle
+ st.markdown("version: 2.0")
 #title
 st.title("Smart Detection System of AI-Generated Text Models")

 #subtitle
 st.markdown("This is a POC for the Smart Detection System of AI-Generated Text Models project (:blue[MSc Data Analytics]); it is a pre-trained model that detects the probabilities of using any of the known LLMs (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")

+ import pickle
+ import pandas as pd
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.linear_model import RidgeClassifier
 import os
 import requests
+
 import numpy as np
 ############
+
+

 # Check if the file exists
+ if not os.path.isfile('ridge_100%_BOW_ngram_full_text.pkl'):

+     url = 'https://jaifar.net/ridge_100%_BOW_ngram_full_text.pkl'
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
     }

     response = requests.get(url, headers=headers)


+     with open('ridge_100%_BOW_ngram_full_text.pkl', 'wb') as file:
         file.write(response.content)


+ # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
+ with open('ridge_100%_BOW_ngram_full_text.pkl', 'rb') as file:
+     clf_loaded = pickle.load(file)

 input_paragraph = st.text_area("Input your text here")
 words_counts = word_tokenize(input_paragraph)
 final_words = len(words_counts)
 st.write('Words counts: ', final_words)

+ # Creates a button named 'Press me'
+ press_me_button = st.button("Which Model Used?")

 df = pd.DataFrame([input_paragraph], columns=["paragraph"])

 # Extracting features
 def extract_features(text):
+     vectorizer = CountVectorizer(ngram_range=(1, 2))

+     # Convert the paragraphs into a matrix of token counts
+     X_vect = vectorizer.fit_transform(text)

+     # Get the feature names
+     feature_names = vectorizer.get_feature_names_out()

+     # Convert the matrix to a DataFrame
+     X_df = pd.DataFrame(X_vect.toarray(), columns=feature_names)
+
+
+     return pd.Series(X_df)

 if press_me_button:
     input_features = df['paragraph'].apply(extract_features)
     predicted_llm = clf_loaded.predict(input_features)
+     st.write(f"Predicted LLM: {predicted_llm[0]}")
+
+ # predicted_proba = clf_loaded.predict_proba(input_features)
+ # probabilities = predicted_proba[0]
+ # labels = clf_loaded.classes_
+
+ # # Create a mapping from old labels to new labels
+ # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
+
+ # # Apply the mapping to the labels
+ # new_labels = [label_mapping[label] for label in labels]
+
+ # # Create a dictionary that maps new labels to probabilities
+ # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
+
+ # # Convert probabilities to percentages and sort the dictionary in descending order
+ # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
+
+ # # Print the dictionary
+ # #st.write(prob_dict)
+
+ # # Create a progress bar and a bar chart for each LLM
+ # for llm, prob in prob_dict.items():
+ #     st.write(llm + ': ' + prob)
+ #     st.progress(float(prob.strip('%'))/100)
+
+
+ #####################################################################
+
+ # import streamlit as st
+ # #subtitle
+ # st.markdown("version: 1.2")
+ # #title
+ # st.title("Smart Detection System of AI-Generated Text Models")
+
+ # #subtitle
+ # st.markdown("This is a POC for the Smart Detection System of AI-Generated Text Models project (:blue[MSc Data Analytics]); it is a pre-trained model that detects the probabilities of using any of the known LLMs (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
+
+ # import os
+ # import requests
+ # # import pickle
+ # import pandas as pd
+ # import nltk
+ # import spacy
+ # from nltk.corpus import stopwords
+ # from nltk.tokenize import word_tokenize, sent_tokenize
+ # import numpy as np
+ # ############
+ # from nltk.stem import WordNetLemmatizer
+ # from nltk import ne_chunk, pos_tag, word_tokenize
+ # from nltk.tree import Tree
+ # from joblib import dump, load
+ # nltk.download('wordnet')
+ # nltk.download('maxent_ne_chunker')
+ # nltk.download('words')
+
+ # #######
+ # nltk.download('punkt')
+ # nltk.download('stopwords')
+ # nltk.download('averaged_perceptron_tagger')
+
+ # # Check if the file exists
+ # if not os.path.isfile('RandomForestClassifier.joblib'):
+
+ # url = 'https://jaifar.net/RandomForestClassifier.joblib'
+ # headers = {
+ # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+ # }
+
+ # response = requests.get(url, headers=headers)
+
+
+ # with open('RandomForestClassifier.joblib', 'wb') as file:
+ # file.write(response.content)
+
+
+ # # Load the model from the file
+ # clf_loaded = load('RandomForestClassifier.joblib')
+
+ # # # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
+ # # with open('RandomForestClassifier.pkl', 'rb') as file:
+ # # clf_loaded = pickle.load(file)
+
+ # input_paragraph = st.text_area("Input your text here")
+ # words_counts = word_tokenize(input_paragraph)
+ # final_words = len(words_counts)
+ # st.write('Words counts: ', final_words)
+
+
+ # # df = pd.DataFrame(columns=["paragraph"])
+ # # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
+
+ # df = pd.DataFrame([input_paragraph], columns=["paragraph"])
+
+
+ # # Variable to control number of words to retrieve
+ # num_words = 500
+
+ # # Retrieving only the first num_words words of the paragraph
+ # input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
+
+ # # Extracting features
+ # def extract_features(text):
+ # words = word_tokenize(text)
+ # sentences = sent_tokenize(text)
+
+ # avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
+ # avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
+ # punctuation_count = len([char for char in text if char in '.,;:?!'])
+ # stopword_count = len([word for word in words if word in stopwords.words('english')])
+
+ # lemmatizer = WordNetLemmatizer()
+ # lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
+
+ # named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
+
+ # tagged_words = nltk.pos_tag(words)
+ # pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
+ # pos_features = {
+ # 'pos_IN': pos_counts['IN'],
+ # 'pos_DT': pos_counts['DT'],
+ # 'pos_NN': pos_counts['NN'],
+ # 'pos_,': pos_counts[','],
+ # 'pos_VBZ': pos_counts['VBZ'],
+ # 'pos_WDT': pos_counts['WDT'],
+ # 'pos_TO': pos_counts['TO'],
+ # 'pos_VB': pos_counts['VB'],
+ # 'pos_VBG': pos_counts['VBG'],
+ # 'pos_.': pos_counts['.'],
+ # 'pos_JJ': pos_counts['JJ'],
+ # 'pos_NNS': pos_counts['NNS'],
+ # 'pos_RB': pos_counts['RB'],
+ # 'pos_CC': pos_counts['CC'],
+ # 'pos_VBN': pos_counts['VBN'],
+ # }
+
+ # features = {
+ # 'avg_word_length': avg_word_length,
+ # 'avg_sent_length': avg_sent_length,
+ # 'punctuation_count': punctuation_count,
+ # 'stopword_count': stopword_count,
+ # 'lemma_count': lemma_count,
+ # 'named_entity_count': named_entity_count,
+ # }
+ # features.update(pos_features)
+
+ # return pd.Series(features)
+
+
+ # # Creates a button named 'Press me'
+ # press_me_button = st.button("Which Model Used?")
+
+ # if press_me_button:
+ # input_features = df['paragraph'].apply(extract_features)
+ # predicted_llm = clf_loaded.predict(input_features)
+ # #st.write(f"Predicted LLM: {predicted_llm[0]}")
+
+ # predicted_proba = clf_loaded.predict_proba(input_features)
+ # probabilities = predicted_proba[0]
+ # labels = clf_loaded.classes_
+
+ # # Create a mapping from old labels to new labels
+ # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}

+ # # Apply the mapping to the labels
+ # new_labels = [label_mapping[label] for label in labels]

+ # # Create a dictionary that maps new labels to probabilities
+ # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}

+ # # Convert probabilities to percentages and sort the dictionary in descending order
+ # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}

+ # # Print the dictionary
+ # #st.write(prob_dict)

+ # # Create a progress bar and a bar chart for each LLM
+ # for llm, prob in prob_dict.items():
+ #     st.write(llm + ': ' + prob)
+ #     st.progress(float(prob.strip('%'))/100)

+ ############################################################


 # import streamlit as st
 # import os
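A closing note on the new version as committed: the script still calls `word_tokenize(input_paragraph)` for the word count but no longer imports NLTK, and `extract_features` fits a fresh `CountVectorizer` on each submitted paragraph inside `df['paragraph'].apply`, so the columns it produces cannot line up with the bag-of-words/n-gram feature space that `ridge_100%_BOW_ngram_full_text.pkl` was trained on (and `fit_transform` expects an iterable of documents, not the single string it receives there). Below is a minimal sketch of the intended inference path, assuming the training-time vectorizer was pickled alongside the classifier; the companion file name `vectorizer_100%_BOW_ngram_full_text.pkl`, its URL, and the `fetch_if_missing` helper are illustrative assumptions, not part of this commit:

    import os
    import pickle
    import requests
    import nltk
    import streamlit as st
    from nltk.tokenize import word_tokenize

    nltk.download('punkt')  # tokenizer data needed for word_tokenize

    def fetch_if_missing(filename, url):
        # Download an artifact once and fail loudly on HTTP errors instead of
        # silently writing an error page into the pickle file.
        if not os.path.isfile(filename):
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(response.content)

    MODEL_FILE = 'ridge_100%_BOW_ngram_full_text.pkl'
    VECTORIZER_FILE = 'vectorizer_100%_BOW_ngram_full_text.pkl'  # hypothetical companion artifact

    fetch_if_missing(MODEL_FILE, f'https://jaifar.net/{MODEL_FILE}')
    fetch_if_missing(VECTORIZER_FILE, f'https://jaifar.net/{VECTORIZER_FILE}')  # hypothetical URL

    with open(MODEL_FILE, 'rb') as f:
        clf_loaded = pickle.load(f)
    with open(VECTORIZER_FILE, 'rb') as f:
        vectorizer = pickle.load(f)  # the CountVectorizer fitted on the training corpus

    input_paragraph = st.text_area("Input your text here")
    st.write('Words counts: ', len(word_tokenize(input_paragraph)))

    if st.button("Which Model Used?") and input_paragraph.strip():
        # transform (not fit_transform) so the columns match the training vocabulary
        X = vectorizer.transform([input_paragraph])
        predicted_llm = clf_loaded.predict(X)
        st.write(f"Predicted LLM: {predicted_llm[0]}")

RidgeClassifier exposes no predict_proba, which is presumably why the per-model probability display from version 1.2 is commented out in this commit; decision_function scores would be the closest substitute if per-model confidence is still wanted.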