jaifar530 committed
Commit ac163d9 · unverified · 1 Parent(s): e5f5906

Update app.py

Files changed (1)
  1. app.py +122 -581
app.py CHANGED
@@ -1,592 +1,133 @@
-
-
- import streamlit as st
- #subtitle
- st.markdown("version: 2.0")
- #title
- st.title("Smart Detection System of AI-Generated Text Models")
-
- #subtitle
- st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
-
- import pickle
- import pandas as pd
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.linear_model import RidgeClassifier
  import os
  import requests
-
  import numpy as np
- ############
-
-
-
- # Check if the file exists
- if not os.path.isfile('ridge_100%_BOW_ngram_full_text.pkl'):

-     url = 'https://jaifar.net/ridge_100%_BOW_ngram_full_text.pkl'
-     headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-     }
-
-     response = requests.get(url, headers=headers)
-
-
-     with open('ridge_100%_BOW_ngram_full_text.pkl', 'wb') as file:
-         file.write(response.content)
-
-
- # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
- with open('ridge_100%_BOW_ngram_full_text.pkl', 'rb') as file:
-     clf_loaded = pickle.load(file)
-
- input_paragraph = st.text_area("Input your text here")
- words_counts = word_tokenize(input_paragraph)
- final_words = len(words_counts)
- st.write('Words counts: ', final_words)
-
- # Creates a button named 'Press me'
- press_me_button = st.button("Which Model Used?")
-
- df = pd.DataFrame([input_paragraph], columns=["paragraph"])
-
- # Extracting features
- def extract_features(text):
-     vectorizer = CountVectorizer(ngram_range=(1, 2))
-
-     # Convert the paragraphs into a matrix of token counts
-     X_vect = vectorizer.fit_transform(text)
-
-     # Get the feature names
-     feature_names = vectorizer.get_feature_names_out()
-
-     # Convert the matrix to a DataFrame
-     X_df = pd.DataFrame(X_vect.toarray(), columns=feature_names)
-
-
-     return pd.Series(X_df)
-
-
- if press_me_button:
-     input_features = df['paragraph'].apply(extract_features)
-     predicted_llm = clf_loaded.predict(input_features)
-     st.write(f"Predicted LLM: {predicted_llm[0]}")
-
- # predicted_proba = clf_loaded.predict_proba(input_features)
- # probabilities = predicted_proba[0]
- # labels = clf_loaded.classes_
-
- # # Create a mapping from old labels to new labels
- # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
-
- # # Apply the mapping to the labels
- # new_labels = [label_mapping[label] for label in labels]
-
- # # Create a dictionary that maps new labels to probabilities
- # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
-
- # # Convert probabilities to percentages and sort the dictionary in descending order
- # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
-
- # # Print the dictionary
- # #st.write(prob_dict)
-
- # # Create a progress bar and a bar chart for each LLM
- # for llm, prob in prob_dict.items():
- # st.write(llm + ': ' + prob)
- # st.progress(float(prob.strip('%'))/100)
-
-
-
- #####################################################################
-
- # import streamlit as st
- # #subtitle
- # st.markdown("version: 1.2")
- # #title
- # st.title("Smart Detection System of AI-Generated Text Models")
-
- # #subtitle
- # st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
-
- # import os
- # import requests
- # # import pickle
- # import pandas as pd
- # import nltk
- # import spacy
- # from nltk.corpus import stopwords
- # from nltk.tokenize import word_tokenize, sent_tokenize
- # import numpy as np
- # ############
- # from nltk.stem import WordNetLemmatizer
- # from nltk import ne_chunk, pos_tag, word_tokenize
- # from nltk.tree import Tree
- # from joblib import dump, load
- # nltk.download('wordnet')
- # nltk.download('maxent_ne_chunker')
- # nltk.download('words')
-
- # #######
- # nltk.download('punkt')
- # nltk.download('stopwords')
- # nltk.download('averaged_perceptron_tagger')
-
- # # Check if the file exists
- # if not os.path.isfile('RandomForestClassifier.joblib'):
-
- # url = 'https://jaifar.net/RandomForestClassifier.joblib'
- # headers = {
- # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
- # }
-
- # response = requests.get(url, headers=headers)
-
-
- # with open('RandomForestClassifier.joblib', 'wb') as file:
- # file.write(response.content)
-
-
- # # Load the model from the file
- # clf_loaded = load('RandomForestClassifier.joblib')
-
- # # # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
- # # with open('RandomForestClassifier.pkl', 'rb') as file:
- # # clf_loaded = pickle.load(file)
-
- # input_paragraph = st.text_area("Input your text here")
- # words_counts = word_tokenize(input_paragraph)
- # final_words = len(words_counts)
- # st.write('Words counts: ', final_words)
-
-
-
- # # df = pd.DataFrame(columns=["paragraph"])
- # # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
-
- # df = pd.DataFrame([input_paragraph], columns=["paragraph"])
-
-
-
- # # Variable to control number of words to retrieve
- # num_words = 500
-
- # # Retrieving only the first num_words words of the paragraph
- # input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
-
- # # Extracting features
- # def extract_features(text):
- # words = word_tokenize(text)
- # sentences = sent_tokenize(text)
-
- # avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
- # avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
- # punctuation_count = len([char for char in text if char in '.,;:?!'])
- # stopword_count = len([word for word in words if word in stopwords.words('english')])
-
- # lemmatizer = WordNetLemmatizer()
- # lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-
- # named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
-
- # tagged_words = nltk.pos_tag(words)
- # pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
- # pos_features = {
- # 'pos_IN': pos_counts['IN'],
- # 'pos_DT': pos_counts['DT'],
- # 'pos_NN': pos_counts['NN'],
- # 'pos_,': pos_counts[','],
- # 'pos_VBZ': pos_counts['VBZ'],
- # 'pos_WDT': pos_counts['WDT'],
- # 'pos_TO': pos_counts['TO'],
- # 'pos_VB': pos_counts['VB'],
- # 'pos_VBG': pos_counts['VBG'],
- # 'pos_.': pos_counts['.'],
- # 'pos_JJ': pos_counts['JJ'],
- # 'pos_NNS': pos_counts['NNS'],
- # 'pos_RB': pos_counts['RB'],
- # 'pos_CC': pos_counts['CC'],
- # 'pos_VBN': pos_counts['VBN'],
- # }
-
- # features = {
- # 'avg_word_length': avg_word_length,
- # 'avg_sent_length': avg_sent_length,
- # 'punctuation_count': punctuation_count,
- # 'stopword_count': stopword_count,
- # 'lemma_count': lemma_count,
- # 'named_entity_count': named_entity_count,
- # }
- # features.update(pos_features)
-
- # return pd.Series(features)
-
-
- # # Creates a button named 'Press me'
- # press_me_button = st.button("Which Model Used?")
-
- # if press_me_button:
- # input_features = df['paragraph'].apply(extract_features)
- # predicted_llm = clf_loaded.predict(input_features)
- # #st.write(f"Predicted LLM: {predicted_llm[0]}")
-
- # predicted_proba = clf_loaded.predict_proba(input_features)
- # probabilities = predicted_proba[0]
- # labels = clf_loaded.classes_
-
- # # Create a mapping from old labels to new labels
- # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
-
- # # Apply the mapping to the labels
- # new_labels = [label_mapping[label] for label in labels]
-
- # # Create a dictionary that maps new labels to probabilities
- # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
-
- # # Convert probabilities to percentages and sort the dictionary in descending order
- # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
-
- # # Print the dictionary
- # #st.write(prob_dict)
-
- # # Create a progress bar and a bar chart for each LLM
- # for llm, prob in prob_dict.items():
- # st.write(llm + ': ' + prob)
- # st.progress(float(prob.strip('%'))/100)
-
- ############################################################
-
-
- # import streamlit as st
- # import os
- # import requests
- # import pickle
- # import pandas as pd
- # import nltk
- # import spacy
- # from nltk.corpus import stopwords
- # from nltk.tokenize import word_tokenize, sent_tokenize
- # import numpy as np
- # ############
- # from nltk.stem import WordNetLemmatizer
- # from nltk import ne_chunk, pos_tag, word_tokenize
- # from nltk.tree import Tree
- # from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
- # nltk.download('wordnet')
- # nltk.download('maxent_ne_chunker')
- # nltk.download('words')
-
- # #######
- # nltk.download('punkt')
- # nltk.download('stopwords')
- # nltk.download('averaged_perceptron_tagger')
-
- # #version
- # st.markdown("v1.9")
-
-
- # # URL of the text file
- # url = 'https://jaifar.net/text.txt'
-
- # headers = {
- # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
- # }
-
- # response = requests.get(url, headers=headers)
-
- # # Check if the request was successful
- # if response.status_code == 200:
- # # Read the content of the file
- # content = response.text
-
- # # Print the content of the file
- # # print(content)
- # else:
- # # Handle the case when the request fails
- # print('Failed to download the file.')
-
-
-
- # #title
- # st.title("Smart Detection System of AI-Generated Text Models")
-
- # #subtitle
- # st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
-
- # #input text
- # input_paragraph = st.text_area("Input your text here")
- # words_counts = word_tokenize(input_paragraph)
- # final_words = len(words_counts)
- # st.write('Words counts: ', final_words)
-
- # # Define your options
- # options = ["AI vs AI - RandomForest - 88 Samples", "AI vs AI - Ridge - 2000 Samples", "AI vs Human"]
-
- # # Create a dropdown menu with "Option 2" as the default
- # # selected_option = st.selectbox('Select an Option', options, index=1)
- # selected_option = st.selectbox('Select an Option', options)
-
-
-
-
-
- # # Check if the file exists
- # if not os.path.isfile('AI_vs_AI_Ridge_2000_Samples.pkl'):
- # # Download the zip file if it doesn't exist
- # url = 'https://jaifar.net/AI_vs_AI_Ridge_2000_Samples.pkl'
- # headers = {
- # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
- # }
-
- # response = requests.get(url, headers=headers)
-
- # # Save the file
- # with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'wb') as file2:
- # file2.write(response.content)
-
-
-
- # # df = pd.DataFrame(columns=["paragraph"])
- # # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
-
- # df = pd.DataFrame([input_paragraph], columns=["paragraph"])
-
-
-
- # # Variable to control number of words to retrieve
- # num_words = 500
-
- # # Retrieving only the first num_words words of the paragraph
- # input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
-
-
- # # Extracting features
- # def extract_features_AI_vs_AI_RandomForest_88_Samples(text):
- # words = word_tokenize(text)
- # sentences = sent_tokenize(text)
-
- # avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
- # avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
- # punctuation_count = len([char for char in text if char in '.,;:?!'])
- # stopword_count = len([word for word in words if word in stopwords.words('english')])
-
- # lemmatizer = WordNetLemmatizer()
- # lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-
- # named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
-
- # tagged_words = nltk.pos_tag(words)
- # pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
- # pos_features = {
- # 'pos_IN': pos_counts['IN'],
- # 'pos_DT': pos_counts['DT'],
- # 'pos_NN': pos_counts['NN'],
- # 'pos_,': pos_counts[','],
- # 'pos_VBZ': pos_counts['VBZ'],
- # 'pos_WDT': pos_counts['WDT'],
- # 'pos_TO': pos_counts['TO'],
- # 'pos_VB': pos_counts['VB'],
- # 'pos_VBG': pos_counts['VBG'],
- # 'pos_.': pos_counts['.'],
- # 'pos_JJ': pos_counts['JJ'],
- # 'pos_NNS': pos_counts['NNS'],
- # 'pos_RB': pos_counts['RB'],
- # 'pos_CC': pos_counts['CC'],
- # 'pos_VBN': pos_counts['VBN'],
- # }
-
- # features = {
- # 'avg_word_length': avg_word_length,
- # 'avg_sent_length': avg_sent_length,
- # 'punctuation_count': punctuation_count,
- # 'stopword_count': stopword_count,
- # 'lemma_count': lemma_count,
- # 'named_entity_count': named_entity_count,
- # }
- # features.update(pos_features)
-
- # return pd.Series(features)
-
-
-
- # # Extracting features for AI_vs_AI_Ridge_2000_Samples
- # def extract_features_AI_vs_AI_Ridge_2000_Samples(text):
-
- # words = word_tokenize(text)
- # sentences = sent_tokenize(text)
-
- # avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
- # avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
- # punctuation_count = len([char for char in text if char in '.,;:?!'])
- # stopword_count = len([word for word in words if word in stopwords.words('english')])
-
- # lemmatizer = WordNetLemmatizer()
- # lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-
- # named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
-
- # tagged_words = nltk.pos_tag(words)
- # pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
- # pos_features = {
- # 'pos_IN': pos_counts['IN'],
- # 'pos_DT': pos_counts['DT'],
- # 'pos_NN': pos_counts['NN'],
- # 'pos_,': pos_counts[','],
- # 'pos_VBZ': pos_counts['VBZ'],
- # 'pos_WDT': pos_counts['WDT'],
- # 'pos_TO': pos_counts['TO'],
- # 'pos_VB': pos_counts['VB'],
- # 'pos_PRP': pos_counts['PRP'],
- # 'pos_VBP': pos_counts['VBP'],
- # 'pos_VBG': pos_counts['VBG'],
- # 'pos_.': pos_counts['.'],
- # 'pos_JJ': pos_counts['JJ'],
- # 'pos_NNS': pos_counts['NNS'],
- # 'pos_RB': pos_counts['RB'],
- # 'pos_PRP$': pos_counts['PRP$'],
- # 'pos_CC': pos_counts['CC'],
- # 'pos_MD': pos_counts['MD'],
- # 'pos_VBN': pos_counts['VBN'],
- # 'pos_NNP': pos_counts['NNP'],
- # }
-
- # features = {
- # 'avg_word_length': avg_word_length,
- # 'avg_sent_length': avg_sent_length,
- # 'punctuation_count': punctuation_count,
- # 'stopword_count': stopword_count,
- # 'lemma_count': lemma_count,
- # 'named_entity_count': named_entity_count,
- # }
- # # features.update(pos_features)
- # features = pd.concat([features, pd.DataFrame(pos_features, index=[0])], axis=1)
-
- # return pd.Series(features)
-
- # # Function from Code(2)
- # def add_vectorized_features(df):
- # vectorizer = CountVectorizer()
- # tfidf_vectorizer = TfidfVectorizer()
- # X_bow = vectorizer.fit_transform(df['paragraph'])
- # X_tfidf = tfidf_vectorizer.fit_transform(df['paragraph'])
- # df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
- # df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
- # df = pd.concat([df, df_bow, df_tfidf], axis=1)
- # return df
-
-
- # # Function define AI_vs_AI_RandomForest_88_Samples
- # def AI_vs_AI_RandomForest_88_Samples(df):

-
-
-
- # input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
- # # try:
- # # predicted_llm = clf_loaded.predict(input_features)
- # # st.write(f"Predicted LLM: {predicted_llm[0]}")
- # # predicted_proba = clf_loaded.predict_proba(input_features)
- # # except Exception as e:
- # # st.write(f"An error occurred: {str(e)}")
-
- # # labels = clf_loaded.classes_
-
- # # # Create a mapping from old labels to new labels
- # # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
-
- # # # Apply the mapping to the labels
- # # new_labels = [label_mapping[label] for label in labels]
-
- # # # Create a dictionary that maps new labels to probabilities
- # # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
-
- # # # Convert probabilities to percentages and sort the dictionary in descending order
- # # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
-
- # # # Print the dictionary
- # # #st.write(prob_dict)
-
- # # # Create a progress bar and a bar chart for each LLM
- # # for llm, prob in prob_dict.items():
- # # st.write(llm + ': ' + prob)
- # # st.progress(float(prob.strip('%'))/100)
- # return
-
-
- # def AI_vs_AI_Ridge_2000_Samples(df):
-
- # # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
- # with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'rb') as file2:
- # clf_loaded = pickle.load(file2)
-

- # input_features = df['paragraph'].apply(extract_features_AI_vs_AI_Ridge_2000_Samples)
-
- # # Here, input_features is a DataFrame, not a Series
- # input_features = pd.concat(input_features.values, ignore_index=True)
-
- # # Add new vectorized features
- # df = add_vectorized_features(df)
-
- # # Concatenate input_features and df along columns
- # final_features = pd.concat([input_features, df], axis=1)
-
- # predicted_llm = clf_loaded.predict(final_features)
- # st.write(f"Predicted LLM: {predicted_llm[0]}")
-
- # return
-
-
-
- # # Check if the file exists
- # if not os.path.isfile('RandomForestClassifier.pkl'):
- # # Download the zip file if it doesn't exist
- # url = 'https://jaifar.net/RandomForestClassifier.pkl'
- # headers = {
- # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
- # }
-
- # response = requests.get(url, headers=headers)
-
- # # Save the file
- # try:
- # with open('RandomForestClassifier.pkl', 'wb') as file:
- # file.write(response.content)
- # except Exception as e:
- # st.write(f"An error occurred while writing RandomForestClassifier.pkl: {str(e)}")
-
- # try:
- # with open('RandomForestClassifier.pkl', 'rb') as file:
- # clf_loaded = pickle.load(file)
- # except Exception as e:
- # st.write(f"An error occurred while loading RandomForestClassifier.pkl: {str(e)}")
-
- # # Creates a button
- # press_me_button = st.button("Which Model Used?")
-
- # if press_me_button:
-
- # input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)

- # try:
- # predicted_llm = clf_loaded.predict(input_features)
- # st.write(f"Predicted LLM: {predicted_llm[0]}")
- # predicted_proba = clf_loaded.predict_proba(input_features)
- # except Exception as e:
- # st.write(f"An error occurred: {str(e)}")
-
- # # # Use the selected option to control the flow of your application
- # # if selected_option == "AI vs AI - RandomForest - 88 Samples":
- # # AI_vs_AI_RandomForest_88_Samples(df)
-
- # # elif selected_option == "AI vs AI - Ridge - 2000 Samples":
- # # AI_vs_AI_Ridge_2000_Samples(df)
-
- # # elif selected_option == "AI vs Human":
- # # st.write("You selected AI vs Human!")
-
-
-
  import os
  import requests
+ import subprocess # Import the subprocess module
+ from keras.models import load_model
+ from keras.preprocessing.text import Tokenizer
+ from keras.preprocessing.sequence import pad_sequences
+ from sklearn.preprocessing import LabelEncoder
+ #from nltk.tokenize import word_tokenize # Assuming you've imported this for word_tokenize
+ import pickle
  import numpy as np
+ import streamlit as st

+ # Custom headers for the HTTP request
+ headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+ }
+
+ # Debugging: Print current working directory initially
+ st.write(f"Initial Current Working Directory: {os.getcwd()}")
+
+ # Check if the model folder exists
+ zip_file_path = "my_authorship_model_zip.zip"
+ if not os.path.exists('my_authorship_model'):
+     try:
+         # Download the model
+         model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
+         r = requests.get(model_url, headers=headers)
+         r.raise_for_status()
+
+         # Debugging: Check if download is successful by examining content length
+         st.write(f"Downloaded model size: {len(r.content)} bytes")
+
+         # Save the downloaded content
+         with open(zip_file_path, "wb") as f:
+             f.write(r.content)
+
+         # Debugging: Verify that the zip file exists
+         if os.path.exists(zip_file_path):
+             st.write("Zip file exists")
+
+             # Debugging: List contents of the zip file using unzip
+             subprocess.run(['unzip', '-l', zip_file_path])
+
+             # Extract the model using unzip
+             unzip_result = subprocess.run(['unzip', '-o', zip_file_path, '-d', 'my_authorship_model'])
+
+             # Debugging: Check unzip exit code (0 means success)
+             if unzip_result.returncode == 0:
+                 st.write("Model folder successfully extracted using unzip")
+                 # Debugging: List the directory contents after extraction
+                 st.write("Listing directory contents:")
+                 st.write(os.listdir('.'))
+
+             else:
+                 st.write("Model folder was not extracted successfully using unzip")
+                 exit(1)
+         else:
+             st.write("Zip file does not exist")
+             exit(1)
+     except Exception as e:
+         st.write(f"Failed to download or extract the model: {e}")
+         exit(1)
+ else:
+     st.write("Model folder exists")
+
+ # Debugging: Print current working directory after extraction
+ st.write(f"Current Working Directory After Extraction: {os.getcwd()}")
+
+
+ # Debugging: Check if model folder contains required files
+ try:
+     model_files = os.listdir('my_authorship_model')
+     st.write(f"Files in model folder: {model_files}")
+ except Exception as e:
+     st.write(f"Could not list files in model folder: {e}")
+
+ # Download required files
+ file_urls = {
+     'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
+     'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
+ }
+
+ for filename, url in file_urls.items():
+     try:
+         r = requests.get(url, headers=headers)
+         r.raise_for_status()
+         with open(filename, 'wb') as f:
+             f.write(r.content)
+     except Exception as e:
+         st.write(f"Failed to download {filename}: {e}")
+         exit(1)
+
+ # Load the saved model
+ loaded_model = load_model("my_authorship_model")
+
+ # Load the saved tokenizer and label encoder
+ with open('tokenizer.pkl', 'rb') as handle:
+     tokenizer = pickle.load(handle)
+
+ with open('label_encoder.pkl', 'rb') as handle:
+     label_encoder = pickle.load(handle)
+
+ max_length = 300 # As defined in the training code
+
+ # Function to predict author for new text
+ def predict_author(new_text, model, tokenizer, label_encoder):
+     sequence = tokenizer.texts_to_sequences([new_text])
+     padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
+     prediction = model.predict(padded_sequence)

+     predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]

+     probabilities = prediction[0]
+     author_probabilities = {}
+     for idx, prob in enumerate(probabilities):
+         author = label_encoder.inverse_transform([idx])[0]
+         author_probabilities[author] = prob

+     return predicted_label, author_probabilities

+ st.markdown("CNN : version: 1.2")
+ new_text = st.text_area("Input your text here")
+ #words_counts = word_tokenize(new_text) # Changed input_paragraph to new_text
+ #final_words = len(words_counts)
+ #st.write('Words counts: ', final_words)

+ predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
+ sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

+ st.write(f"The text is most likely written by: {predicted_author}")
+ st.write("Probabilities for each author are (sorted):")
+ for author, prob in sorted_probabilities:
+     st.write(f"{author}: {prob * 100:.2f}%")
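Note on the extraction step added above: the new code shells out to an `unzip` binary and calls `exit(1)` on failure, which terminates the whole Python process. Below is a minimal sketch of a more portable alternative that uses only the standard-library `zipfile` module and Streamlit's `st.stop()`. The folder, zip and URL names are taken from this diff; the helper name `ensure_model_dir`, the `timeout` value and the shortened User-Agent string are illustrative assumptions, not part of the commit.

import os
import zipfile

import requests
import streamlit as st

MODEL_DIR = "my_authorship_model"                 # folder name used in this commit
ZIP_PATH = "my_authorship_model_zip.zip"          # zip name used in this commit
MODEL_URL = "https://jaifar.net/ADS/my_authorship_model_zip.zip"
HEADERS = {"User-Agent": "Mozilla/5.0"}           # shortened UA string (assumption)

def ensure_model_dir():
    """Download and unpack the model folder if it is not already on disk."""
    if os.path.exists(MODEL_DIR):
        st.write("Model folder exists")
        return
    try:
        # Fetch the zipped model and write it to disk
        r = requests.get(MODEL_URL, headers=HEADERS, timeout=120)
        r.raise_for_status()
        with open(ZIP_PATH, "wb") as f:
            f.write(r.content)
        # zipfile works on any base image; no external `unzip` binary required
        with zipfile.ZipFile(ZIP_PATH) as zf:
            zf.extractall(MODEL_DIR)
        st.write("Model folder extracted")
    except Exception as e:
        st.error(f"Failed to download or extract the model: {e}")
        st.stop()  # halts only this Streamlit run, instead of exit(1) killing the process

ensure_model_dir()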