committed on
Browse files
@@ -1,592 +1,133 @@
1 |
2 |
3 |
import streamlit as st
4 |
5 |
st.markdown("version: 2.0")
6 |
7 |
st.title("Smart Detection System of AI-Generated Text Models")
8 |
9 |
10 |
st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
11 |
12 |
import pickle
13 |
import pandas as pd
14 |
from sklearn.feature_extraction.text import CountVectorizer
15 |
from sklearn.linear_model import RidgeClassifier
16 |
import os
17 |
import requests
18 |
19 |
import numpy as np
20 |
21 |
22 |
23 |
24 |
# Check if the file exists
25 |
if not os.path.isfile('ridge_100%_BOW_ngram_full_text.pkl'):
26 |
27 |
28 |
29 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
# from nltk.tree import Tree
125 |
# from joblib import dump, load
126 |
127 |
128 |
129 |
130 |
# #######
131 |
132 |
133 |
134 |
135 |
# # Check if the file exists
136 |
# if not os.path.isfile('RandomForestClassifier.joblib'):
137 |
138 |
# url = ''
139 |
# headers = {
140 |
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
141 |
# }
142 |
143 |
# response = requests.get(url, headers=headers)
144 |
145 |
146 |
# with open('RandomForestClassifier.joblib', 'wb') as file:
147 |
# file.write(response.content)
148 |
149 |
150 |
# # Load the model from the file
151 |
# clf_loaded = load('RandomForestClassifier.joblib')
152 |
153 |
# # # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
154 |
# # with open('RandomForestClassifier.pkl', 'rb') as file:
155 |
# # clf_loaded = pickle.load(file)
156 |
157 |
# input_paragraph = st.text_area("Input your text here")
158 |
# words_counts = word_tokenize(input_paragraph)
159 |
# final_words = len(words_counts)
160 |
# st.write('Words counts: ', final_words)
161 |
162 |
163 |
164 |
# # df = pd.DataFrame(columns=["paragraph"])
165 |
# # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
166 |
167 |
# df = pd.DataFrame([input_paragraph], columns=["paragraph"])
168 |
169 |
170 |
171 |
# # Variable to control number of words to retrieve
172 |
# num_words = 500
173 |
174 |
# # Retrieving only the first num_words words of the paragraph
175 |
# input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
176 |
177 |
# # Extracting features
178 |
# def extract_features(text):
179 |
# words = word_tokenize(text)
180 |
# sentences = sent_tokenize(text)
181 |
182 |
# avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
183 |
# avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
184 |
# punctuation_count = len([char for char in text if char in '.,;:?!'])
185 |
# stopword_count = len([word for word in words if word in stopwords.words('english')])
186 |
187 |
# lemmatizer = WordNetLemmatizer()
188 |
# lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
189 |
190 |
# named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
191 |
192 |
# tagged_words = nltk.pos_tag(words)
193 |
# pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
194 |
# pos_features = {
195 |
# 'pos_IN': pos_counts['IN'],
196 |
# 'pos_DT': pos_counts['DT'],
197 |
# 'pos_NN': pos_counts['NN'],
198 |
# 'pos_,': pos_counts[','],
199 |
# 'pos_VBZ': pos_counts['VBZ'],
200 |
# 'pos_WDT': pos_counts['WDT'],
201 |
# 'pos_TO': pos_counts['TO'],
202 |
# 'pos_VB': pos_counts['VB'],
203 |
# 'pos_VBG': pos_counts['VBG'],
204 |
# 'pos_.': pos_counts['.'],
205 |
# 'pos_JJ': pos_counts['JJ'],
206 |
# 'pos_NNS': pos_counts['NNS'],
207 |
# 'pos_RB': pos_counts['RB'],
208 |
# 'pos_CC': pos_counts['CC'],
209 |
# 'pos_VBN': pos_counts['VBN'],
210 |
# }
211 |
212 |
# features = {
213 |
# 'avg_word_length': avg_word_length,
214 |
# 'avg_sent_length': avg_sent_length,
215 |
# 'punctuation_count': punctuation_count,
216 |
# 'stopword_count': stopword_count,
217 |
# 'lemma_count': lemma_count,
218 |
# 'named_entity_count': named_entity_count,
219 |
# }
220 |
# features.update(pos_features)
221 |
222 |
# return pd.Series(features)
223 |
224 |
225 |
# # Creates a button named 'Press me'
226 |
# press_me_button = st.button("Which Model Used?")
227 |
228 |
# if press_me_button:
229 |
# input_features = df['paragraph'].apply(extract_features)
230 |
# predicted_llm = clf_loaded.predict(input_features)
231 |
# #st.write(f"Predicted LLM: {predicted_llm[0]}")
232 |
233 |
# predicted_proba = clf_loaded.predict_proba(input_features)
234 |
# probabilities = predicted_proba[0]
235 |
# labels = clf_loaded.classes_
236 |
237 |
# # Create a mapping from old labels to new labels
238 |
# label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
239 |
240 |
# # Apply the mapping to the labels
241 |
# new_labels = [label_mapping[label] for label in labels]
242 |
243 |
# # Create a dictionary that maps new labels to probabilities
244 |
# prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
245 |
246 |
# # Convert probabilities to percentages and sort the dictionary in descending order
247 |
# prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
248 |
249 |
# # Print the dictionary
250 |
# #st.write(prob_dict)
251 |
252 |
# # Create a progress bar and a bar chart for each LLM
253 |
# for llm, prob in prob_dict.items():
254 |
# st.write(llm + ': ' + prob)
255 |
# st.progress(float(prob.strip('%'))/100)
256 |
257 |
258 |
259 |
260 |
# import streamlit as st
261 |
# import os
262 |
# import requests
263 |
# import pickle
264 |
# import pandas as pd
265 |
# import nltk
266 |
# import spacy
267 |
# from nltk.corpus import stopwords
268 |
# from nltk.tokenize import word_tokenize, sent_tokenize
269 |
# import numpy as np
270 |
# ############
271 |
# from nltk.stem import WordNetLemmatizer
272 |
# from nltk import ne_chunk, pos_tag, word_tokenize
273 |
# from nltk.tree import Tree
274 |
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
275 |
276 |
277 |
278 |
279 |
# #######
280 |
281 |
282 |
283 |
284 |
# #version
285 |
# st.markdown("v1.9")
286 |
287 |
288 |
# # URL of the text file
289 |
# url = ''
290 |
291 |
# headers = {
292 |
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
293 |
# }
294 |
295 |
# response = requests.get(url, headers=headers)
296 |
297 |
# # Check if the request was successful
298 |
# if response.status_code == 200:
299 |
# # Read the content of the file
300 |
# content = response.text
301 |
302 |
# # Print the content of the file
303 |
# # print(content)
304 |
# else:
305 |
# # Handle the case when the request fails
306 |
# print('Failed to download the file.')
307 |
308 |
309 |
310 |
# #title
311 |
# st.title("Smart Detection System of AI-Generated Text Models")
312 |
313 |
# #subtitle
314 |
# st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
315 |
316 |
# #input text
317 |
# input_paragraph = st.text_area("Input your text here")
318 |
# words_counts = word_tokenize(input_paragraph)
319 |
# final_words = len(words_counts)
320 |
# st.write('Words counts: ', final_words)
321 |
322 |
# # Define your options
323 |
# options = ["AI vs AI - RandomForest - 88 Samples", "AI vs AI - Ridge - 2000 Samples", "AI vs Human"]
324 |
325 |
# # Create a dropdown menu with "Option 2" as the default
326 |
# # selected_option = st.selectbox('Select an Option', options, index=1)
327 |
# selected_option = st.selectbox('Select an Option', options)
328 |
329 |
330 |
331 |
332 |
333 |
# # Check if the file exists
334 |
# if not os.path.isfile('AI_vs_AI_Ridge_2000_Samples.pkl'):
335 |
# # Download the zip file if it doesn't exist
336 |
# url = ''
337 |
# headers = {
338 |
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
339 |
# }
340 |
341 |
# response = requests.get(url, headers=headers)
342 |
343 |
# # Save the file
344 |
# with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'wb') as file2:
345 |
# file2.write(response.content)
346 |
347 |
348 |
349 |
# # df = pd.DataFrame(columns=["paragraph"])
350 |
# # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
351 |
352 |
# df = pd.DataFrame([input_paragraph], columns=["paragraph"])
353 |
354 |
355 |
356 |
# # Variable to control number of words to retrieve
357 |
# num_words = 500
358 |
359 |
# # Retrieving only the first num_words words of the paragraph
360 |
# input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
361 |
362 |
363 |
# # Extracting features
364 |
# def extract_features_AI_vs_AI_RandomForest_88_Samples(text):
365 |
# words = word_tokenize(text)
366 |
# sentences = sent_tokenize(text)
367 |
368 |
# avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
369 |
# avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
370 |
# punctuation_count = len([char for char in text if char in '.,;:?!'])
371 |
# stopword_count = len([word for word in words if word in stopwords.words('english')])
372 |
373 |
# lemmatizer = WordNetLemmatizer()
374 |
# lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
375 |
376 |
# named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
377 |
378 |
# tagged_words = nltk.pos_tag(words)
379 |
# pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
380 |
# pos_features = {
381 |
# 'pos_IN': pos_counts['IN'],
382 |
# 'pos_DT': pos_counts['DT'],
383 |
# 'pos_NN': pos_counts['NN'],
384 |
# 'pos_,': pos_counts[','],
385 |
# 'pos_VBZ': pos_counts['VBZ'],
386 |
# 'pos_WDT': pos_counts['WDT'],
387 |
# 'pos_TO': pos_counts['TO'],
388 |
# 'pos_VB': pos_counts['VB'],
389 |
# 'pos_VBG': pos_counts['VBG'],
390 |
# 'pos_.': pos_counts['.'],
391 |
# 'pos_JJ': pos_counts['JJ'],
392 |
# 'pos_NNS': pos_counts['NNS'],
393 |
# 'pos_RB': pos_counts['RB'],
394 |
# 'pos_CC': pos_counts['CC'],
395 |
# 'pos_VBN': pos_counts['VBN'],
396 |
# }
397 |
398 |
# features = {
399 |
# 'avg_word_length': avg_word_length,
400 |
# 'avg_sent_length': avg_sent_length,
401 |
# 'punctuation_count': punctuation_count,
402 |
# 'stopword_count': stopword_count,
403 |
# 'lemma_count': lemma_count,
404 |
# 'named_entity_count': named_entity_count,
405 |
# }
406 |
# features.update(pos_features)
407 |
408 |
# return pd.Series(features)
409 |
410 |
411 |
412 |
# # Extracting features for AI_vs_AI_Ridge_2000_Samples
413 |
# def extract_features_AI_vs_AI_Ridge_2000_Samples(text):
414 |
415 |
# words = word_tokenize(text)
416 |
# sentences = sent_tokenize(text)
417 |
418 |
# avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
419 |
# avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
420 |
# punctuation_count = len([char for char in text if char in '.,;:?!'])
421 |
# stopword_count = len([word for word in words if word in stopwords.words('english')])
422 |
423 |
# lemmatizer = WordNetLemmatizer()
424 |
# lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
425 |
426 |
# named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
427 |
428 |
# tagged_words = nltk.pos_tag(words)
429 |
# pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
430 |
# pos_features = {
431 |
# 'pos_IN': pos_counts['IN'],
432 |
# 'pos_DT': pos_counts['DT'],
433 |
# 'pos_NN': pos_counts['NN'],
434 |
# 'pos_,': pos_counts[','],
435 |
# 'pos_VBZ': pos_counts['VBZ'],
436 |
# 'pos_WDT': pos_counts['WDT'],
437 |
# 'pos_TO': pos_counts['TO'],
438 |
# 'pos_VB': pos_counts['VB'],
439 |
# 'pos_PRP': pos_counts['PRP'],
440 |
# 'pos_VBP': pos_counts['VBP'],
441 |
# 'pos_VBG': pos_counts['VBG'],
442 |
# 'pos_.': pos_counts['.'],
443 |
# 'pos_JJ': pos_counts['JJ'],
444 |
# 'pos_NNS': pos_counts['NNS'],
445 |
# 'pos_RB': pos_counts['RB'],
446 |
# 'pos_PRP$': pos_counts['PRP$'],
447 |
# 'pos_CC': pos_counts['CC'],
448 |
# 'pos_MD': pos_counts['MD'],
449 |
# 'pos_VBN': pos_counts['VBN'],
450 |
# 'pos_NNP': pos_counts['NNP'],
451 |
# }
452 |
453 |
# features = {
454 |
# 'avg_word_length': avg_word_length,
455 |
# 'avg_sent_length': avg_sent_length,
456 |
# 'punctuation_count': punctuation_count,
457 |
# 'stopword_count': stopword_count,
458 |
# 'lemma_count': lemma_count,
459 |
# 'named_entity_count': named_entity_count,
460 |
# }
461 |
# # features.update(pos_features)
462 |
# features = pd.concat([features, pd.DataFrame(pos_features, index=[0])], axis=1)
463 |
464 |
# return pd.Series(features)
465 |
466 |
# # Function from Code(2)
467 |
# def add_vectorized_features(df):
468 |
# vectorizer = CountVectorizer()
469 |
# tfidf_vectorizer = TfidfVectorizer()
470 |
# X_bow = vectorizer.fit_transform(df['paragraph'])
471 |
# X_tfidf = tfidf_vectorizer.fit_transform(df['paragraph'])
472 |
# df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
473 |
# df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
474 |
# df = pd.concat([df, df_bow, df_tfidf], axis=1)
475 |
# return df
476 |
477 |
478 |
# # Function define AI_vs_AI_RandomForest_88_Samples
479 |
# def AI_vs_AI_RandomForest_88_Samples(df):
480 |
481 |
482 |
483 |
484 |
# input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
485 |
# # try:
486 |
# # predicted_llm = clf_loaded.predict(input_features)
487 |
# # st.write(f"Predicted LLM: {predicted_llm[0]}")
488 |
# # predicted_proba = clf_loaded.predict_proba(input_features)
489 |
# # except Exception as e:
490 |
# # st.write(f"An error occurred: {str(e)}")
491 |
492 |
# # labels = clf_loaded.classes_
493 |
494 |
# # # Create a mapping from old labels to new labels
495 |
# # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
496 |
497 |
# # # Apply the mapping to the labels
498 |
# # new_labels = [label_mapping[label] for label in labels]
499 |
500 |
# # # Create a dictionary that maps new labels to probabilities
501 |
# # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
502 |
503 |
# # # Convert probabilities to percentages and sort the dictionary in descending order
504 |
# # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
505 |
506 |
# # # Print the dictionary
507 |
# # #st.write(prob_dict)
508 |
509 |
# # # Create a progress bar and a bar chart for each LLM
510 |
# # for llm, prob in prob_dict.items():
511 |
# # st.write(llm + ': ' + prob)
512 |
# # st.progress(float(prob.strip('%'))/100)
513 |
# return
514 |
515 |
516 |
# def AI_vs_AI_Ridge_2000_Samples(df):
517 |
518 |
# # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
519 |
# with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'rb') as file2:
520 |
# clf_loaded = pickle.load(file2)
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
# # Add new vectorized features
529 |
# df = add_vectorized_features(df)
530 |
531 |
# # Concatenate input_features and df along columns
532 |
# final_features = pd.concat([input_features, df], axis=1)
533 |
534 |
# predicted_llm = clf_loaded.predict(final_features)
535 |
# st.write(f"Predicted LLM: {predicted_llm[0]}")
536 |
537 |
# return
538 |
539 |
540 |
541 |
# # Check if the file exists
542 |
# if not os.path.isfile('RandomForestClassifier.pkl'):
543 |
# # Download the zip file if it doesn't exist
544 |
# url = ''
545 |
# headers = {
546 |
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
547 |
# }
548 |
549 |
# response = requests.get(url, headers=headers)
550 |
551 |
# # Save the file
552 |
# try:
553 |
# with open('RandomForestClassifier.pkl', 'wb') as file:
554 |
# file.write(response.content)
555 |
# except Exception as e:
556 |
# st.write(f"An error occurred while writing RandomForestClassifier.pkl: {str(e)}")
557 |
558 |
# try:
559 |
# with open('RandomForestClassifier.pkl', 'rb') as file:
560 |
# clf_loaded = pickle.load(file)
561 |
# except Exception as e:
562 |
# st.write(f"An error occurred while loading RandomForestClassifier.pkl: {str(e)}")
563 |
564 |
# # Creates a button
565 |
# press_me_button = st.button("Which Model Used?")
566 |
567 |
# if press_me_button:
568 |
569 |
# input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
570 |
571 |
572 |
# predicted_llm = clf_loaded.predict(input_features)
573 |
# st.write(f"Predicted LLM: {predicted_llm[0]}")
574 |
# predicted_proba = clf_loaded.predict_proba(input_features)
575 |
# except Exception as e:
576 |
# st.write(f"An error occurred: {str(e)}")
577 |
578 |
# # # Use the selected option to control the flow of your application
579 |
# # if selected_option == "AI vs AI - RandomForest - 88 Samples":
580 |
# # AI_vs_AI_RandomForest_88_Samples(df)
581 |
582 |
# # elif selected_option == "AI vs AI - Ridge - 2000 Samples":
583 |
# # AI_vs_AI_Ridge_2000_Samples(df)
584 |
585 |
# # elif selected_option == "AI vs Human":
586 |
# # st.write("You selected AI vs Human!")
587 |
588 |
589 |
590 |
591 |
592 |
1 |
import os
2 |
import requests
3 |
import subprocess # Import the subprocess module
4 |
from keras.models import load_model
5 |
from keras.preprocessing.text import Tokenizer
6 |
from keras.preprocessing.sequence import pad_sequences
7 |
from sklearn.preprocessing import LabelEncoder
8 |
#from nltk.tokenize import word_tokenize # Assuming you've imported this for word_tokenize
9 |
import pickle
10 |
import numpy as np
11 |
import streamlit as st
12 |
13 |
# Custom headers for the HTTP request
14 |
headers = {
15 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
16 |
17 |
18 |
# Debugging: Print current working directory initially
19 |
st.write(f"Initial Current Working Directory: {os.getcwd()}")
20 |
21 |
# Check if the model folder exists
22 |
zip_file_path = ""
23 |
if not os.path.exists('my_authorship_model'):
24 |
25 |
# Download the model
26 |
model_url = ''
27 |
r = requests.get(model_url, headers=headers)
28 |
29 |
30 |
# Debugging: Check if download is successful by examining content length
31 |
st.write(f"Downloaded model size: {len(r.content)} bytes")
32 |
33 |
# Save the downloaded content
34 |
with open(zip_file_path, "wb") as f:
35 |
36 |
37 |
# Debugging: Verify that the zip file exists
38 |
if os.path.exists(zip_file_path):
39 |
st.write("Zip file exists")
40 |
41 |
# Debugging: List contents of the zip file using unzip
42 |
+['unzip', '-l', zip_file_path])
43 |
44 |
# Extract the model using unzip
45 |
unzip_result =['unzip', '-o', zip_file_path, '-d', 'my_authorship_model'])
46 |
47 |
# Debugging: Check unzip exit code (0 means success)
48 |
if unzip_result.returncode == 0:
49 |
st.write("Model folder successfully extracted using unzip")
50 |
# Debugging: List the directory contents after extraction
51 |
st.write("Listing directory contents:")
52 |
53 |
54 |
55 |
st.write("Model folder was not extracted successfully using unzip")
56 |
57 |
58 |
st.write("Zip file does not exist")
59 |
60 |
except Exception as e:
61 |
st.write(f"Failed to download or extract the model: {e}")
62 |
63 |
64 |
st.write("Model folder exists")
65 |
66 |
# Debugging: Print current working directory after extraction
67 |
st.write(f"Current Working Directory After Extraction: {os.getcwd()}")
68 |
69 |
70 |
# Debugging: Check if model folder contains required files
71 |
72 |
model_files = os.listdir('my_authorship_model')
73 |
st.write(f"Files in model folder: {model_files}")
74 |
except Exception as e:
75 |
st.write(f"Could not list files in model folder: {e}")
76 |
77 |
# Download required files
78 |
file_urls = {
79 |
'tokenizer.pkl': '',
80 |
'label_encoder.pkl': ''
81 |
82 |
83 |
for filename, url in file_urls.items():
84 |
85 |
r = requests.get(url, headers=headers)
86 |
87 |
with open(filename, 'wb') as f:
88 |
89 |
except Exception as e:
90 |
st.write(f"Failed to download {filename}: {e}")
91 |
92 |
93 |
# Load the saved Keras model from the local model folder.
loaded_model = load_model("my_authorship_model")


def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # NOTE(review): these .pkl files appear to be fetched over HTTP earlier in
    # this script — only load them if the download source is fully trusted.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the saved tokenizer and label encoder.
tokenizer = _load_pickle('tokenizer.pkl')
label_encoder = _load_pickle('label_encoder.pkl')

max_length = 300  # As defined in the training code
104 |
105 |
# Function to predict author for new text
106 |
def predict_author(new_text, model, tokenizer, label_encoder):
    """Predict the most likely author label for a single piece of text.

    Args:
        new_text: The text to classify.
        model: Object whose ``predict`` is called on the padded sequence and
            returns one row of per-class scores for the single input.
        tokenizer: Object providing ``texts_to_sequences``.
        label_encoder: Object providing ``inverse_transform`` from class
            indices to author labels.

    Returns:
        A ``(predicted_label, author_probabilities)`` tuple, where
        ``author_probabilities`` maps every author label to the score the
        model produced for it.
    """
    sequence = tokenizer.texts_to_sequences([new_text])
    # Pad/truncate at the end so the input length is always ``max_length``
    # (module-level constant).
    padded_sequence = pad_sequences(
        sequence, maxlen=max_length, padding='post', truncating='post'
    )

    # Only one text is submitted, so the first row holds all class scores.
    probabilities = model.predict(padded_sequence)[0]

    # Decode every class index in one call instead of one call per class.
    authors = label_encoder.inverse_transform(list(range(len(probabilities))))
    author_probabilities = dict(zip(authors, probabilities))

    predicted_label = authors[probabilities.argmax()]
    return predicted_label, author_probabilities
120 |
121 |
# --- User interface: read text, run the model, show ranked results ---------
st.markdown("CNN : version: 1.2")
new_text = st.text_area("Input your text here")
#words_counts = word_tokenize(new_text)  # Changed input_paragraph to new_text
#final_words = len(words_counts)
#st.write('Words counts: ', final_words)

# Streamlit re-executes the script on every interaction, including the first
# page load when the text area is still empty. Skip inference in that case so
# no prediction is reported for blank input.
if new_text.strip():
    predicted_author, author_probabilities = predict_author(
        new_text, loaded_model, tokenizer, label_encoder
    )
    # Highest score first.
    sorted_probabilities = sorted(
        author_probabilities.items(), key=lambda x: x[1], reverse=True
    )

    st.write(f"The text is most likely written by: {predicted_author}")
    st.write("Probabilities for each author are (sorted):")
    for author, prob in sorted_probabilities:
        st.write(f"{author}: {prob * 100:.2f}%")
else:
    st.write("Enter some text above to see a prediction.")