AI-Text-Detector

Sleeping

App Files Community

jaifar530 committed on Sep 16, 2023

Commit

ac163d9

unverified ·

1 Parent(s): e5f5906

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -581

app.py CHANGED Viewed

@@ -1,592 +1,133 @@
-import streamlit as st
-#subtitle
-st.markdown("version: 2.0")
-#title
-st.title("Smart Detection System of AI-Generated Text Models")
-#subtitle
-st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
-import pickle
-import pandas as pd
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.linear_model import RidgeClassifier
 import os
 import requests
 import numpy as np
-############
-# Check if the file exists
-if not os.path.isfile('ridge_100%_BOW_ngram_full_text.pkl'):
-    url = 'https://jaifar.net/ridge_100%_BOW_ngram_full_text.pkl'
-    headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-    }
-    response = requests.get(url, headers=headers)
-    with open('ridge_100%_BOW_ngram_full_text.pkl', 'wb') as file:
-        file.write(response.content)
-# At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
-with open('ridge_100%_BOW_ngram_full_text.pkl', 'rb') as file:
-    clf_loaded = pickle.load(file)
-input_paragraph = st.text_area("Input your text here")
-words_counts = word_tokenize(input_paragraph)
-final_words = len(words_counts)
-st.write('Words counts: ', final_words)
-# Creates a button named 'Press me'
-press_me_button = st.button("Which Model Used?")
-df = pd.DataFrame([input_paragraph], columns=["paragraph"])
-# Extracting features
-def extract_features(text):
-    vectorizer = CountVectorizer(ngram_range=(1, 2))
-    # Convert the paragraphs into a matrix of token counts
-    X_vect = vectorizer.fit_transform(text)
-    # Get the feature names
-    feature_names = vectorizer.get_feature_names_out()
-    # Convert the matrix to a DataFrame
-    X_df = pd.DataFrame(X_vect.toarray(), columns=feature_names)
-    return pd.Series(X_df)
-if press_me_button:
-    input_features = df['paragraph'].apply(extract_features)
-    predicted_llm = clf_loaded.predict(input_features)
-    st.write(f"Predicted LLM: {predicted_llm[0]}")
-    # predicted_proba = clf_loaded.predict_proba(input_features)
-    # probabilities = predicted_proba[0]
-    # labels = clf_loaded.classes_
-    # # Create a mapping from old labels to new labels
-    # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
-    # # Apply the mapping to the labels
-    # new_labels = [label_mapping[label] for label in labels]
-    # # Create a dictionary that maps new labels to probabilities
-    # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
-    # # Convert probabilities to percentages and sort the dictionary in descending order
-    # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
-    # # Print the dictionary
-    # #st.write(prob_dict)
-    # # Create a progress bar and a bar chart for each LLM
-    # for llm, prob in prob_dict.items():
-    #     st.write(llm + ': ' + prob)
-    #     st.progress(float(prob.strip('%'))/100)
-#####################################################################
-# import streamlit as st
-# #subtitle
-# st.markdown("version: 1.2")
-# #title
-# st.title("Smart Detection System of AI-Generated Text Models")
-# #subtitle
-# st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
-# import os
-# import requests
-# # import pickle
-# import pandas as pd
-# import nltk
-# import spacy
-# from nltk.corpus import stopwords
-# from nltk.tokenize import word_tokenize, sent_tokenize
-# import numpy as np
-# ############
-# from nltk.stem import WordNetLemmatizer
-# from nltk import ne_chunk, pos_tag, word_tokenize
-# from nltk.tree import Tree
-# from joblib import dump, load
-# nltk.download('wordnet')
-# nltk.download('maxent_ne_chunker')
-# nltk.download('words')
-# #######
-# nltk.download('punkt')
-# nltk.download('stopwords')
-# nltk.download('averaged_perceptron_tagger')
-# # Check if the file exists
-# if not os.path.isfile('RandomForestClassifier.joblib'):
-#     url = 'https://jaifar.net/RandomForestClassifier.joblib'
-#     headers = {
-#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-#     }
-#     response = requests.get(url, headers=headers)
-#     with open('RandomForestClassifier.joblib', 'wb') as file:
-#         file.write(response.content)
-# # Load the model from the file
-# clf_loaded = load('RandomForestClassifier.joblib')
-# # # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
-# # with open('RandomForestClassifier.pkl', 'rb') as file:
-# #     clf_loaded = pickle.load(file)
-# input_paragraph = st.text_area("Input your text here")
-# words_counts = word_tokenize(input_paragraph)
-# final_words = len(words_counts)
-# st.write('Words counts: ', final_words)
-# # df = pd.DataFrame(columns=["paragraph"])
-# # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
-# df = pd.DataFrame([input_paragraph], columns=["paragraph"])
-# # Variable to control number of words to retrieve
-# num_words = 500
-# # Retrieving only the first num_words words of the paragraph
-# input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
-# # Extracting features
-# def extract_features(text):
-#     words = word_tokenize(text)
-#     sentences = sent_tokenize(text)
-#     avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
-#     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
-#     punctuation_count = len([char for char in text if char in '.,;:?!'])
-#     stopword_count = len([word for word in words if word in stopwords.words('english')])
-#     lemmatizer = WordNetLemmatizer()
-#     lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-#     named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
-#     tagged_words = nltk.pos_tag(words)
-#     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
-#     pos_features = {
-#         'pos_IN': pos_counts['IN'],
-#         'pos_DT': pos_counts['DT'],
-#         'pos_NN': pos_counts['NN'],
-#         'pos_,': pos_counts[','],
-#         'pos_VBZ': pos_counts['VBZ'],
-#         'pos_WDT': pos_counts['WDT'],
-#         'pos_TO': pos_counts['TO'],
-#         'pos_VB': pos_counts['VB'],
-#         'pos_VBG': pos_counts['VBG'],
-#         'pos_.': pos_counts['.'],
-#         'pos_JJ': pos_counts['JJ'],
-#         'pos_NNS': pos_counts['NNS'],
-#         'pos_RB': pos_counts['RB'],
-#         'pos_CC': pos_counts['CC'],
-#         'pos_VBN': pos_counts['VBN'],
-#     }
-#     features = {
-#         'avg_word_length': avg_word_length,
-#         'avg_sent_length': avg_sent_length,
-#         'punctuation_count': punctuation_count,
-#         'stopword_count': stopword_count,
-#         'lemma_count': lemma_count,
-#         'named_entity_count': named_entity_count,
-#     }
-#     features.update(pos_features)
-#     return pd.Series(features)
-# # Creates a button named 'Press me'
-# press_me_button = st.button("Which Model Used?")
-# if press_me_button:
-#     input_features = df['paragraph'].apply(extract_features)
-#     predicted_llm = clf_loaded.predict(input_features)
-#     #st.write(f"Predicted LLM: {predicted_llm[0]}")
-#     predicted_proba = clf_loaded.predict_proba(input_features)
-#     probabilities = predicted_proba[0]
-#     labels = clf_loaded.classes_
-#     # Create a mapping from old labels to new labels
-#     label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
-#     # Apply the mapping to the labels
-#     new_labels = [label_mapping[label] for label in labels]
-#     # Create a dictionary that maps new labels to probabilities
-#     prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
-#     # Convert probabilities to percentages and sort the dictionary in descending order
-#     prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
-#     # Print the dictionary
-#     #st.write(prob_dict)
-#     # Create a progress bar and a bar chart for each LLM
-#     for llm, prob in prob_dict.items():
-#         st.write(llm + ': ' + prob)
-#         st.progress(float(prob.strip('%'))/100)
-############################################################
-# import streamlit as st
-# import os
-# import requests
-# import pickle
-# import pandas as pd
-# import nltk
-# import spacy
-# from nltk.corpus import stopwords
-# from nltk.tokenize import word_tokenize, sent_tokenize
-# import numpy as np
-# ############
-# from nltk.stem import WordNetLemmatizer
-# from nltk import ne_chunk, pos_tag, word_tokenize
-# from nltk.tree import Tree
-# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
-# nltk.download('wordnet')
-# nltk.download('maxent_ne_chunker')
-# nltk.download('words')
-# #######
-# nltk.download('punkt')
-# nltk.download('stopwords')
-# nltk.download('averaged_perceptron_tagger')
-# #version
-# st.markdown("v1.9")
-# # URL of the text file
-# url = 'https://jaifar.net/text.txt'
-# headers = {
-# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-# }
-# response = requests.get(url, headers=headers)
-# # Check if the request was successful
-# if response.status_code == 200:
-#     # Read the content of the file
-#     content = response.text
-#     # Print the content of the file
-#     # print(content)
-# else:
-#     # Handle the case when the request fails
-#     print('Failed to download the file.')
-# #title
-# st.title("Smart Detection System of AI-Generated Text Models")
-# #subtitle
-# st.markdown("This is a POC for Smart Detection System of AI Generated Text Models project (:blue[MSc Data Analytics]), it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)")
-# #input text
-# input_paragraph = st.text_area("Input your text here")
-# words_counts = word_tokenize(input_paragraph)
-# final_words = len(words_counts)
-# st.write('Words counts: ', final_words)
-# # Define your options
-# options = ["AI vs AI - RandomForest - 88 Samples", "AI vs AI - Ridge - 2000 Samples", "AI vs Human"]
-# # Create a dropdown menu with "Option 2" as the default
-# # selected_option = st.selectbox('Select an Option', options, index=1)
-# selected_option = st.selectbox('Select an Option', options)
-# # Check if the file exists
-# if not os.path.isfile('AI_vs_AI_Ridge_2000_Samples.pkl'):
-#     # Download the zip file if it doesn't exist
-#     url = 'https://jaifar.net/AI_vs_AI_Ridge_2000_Samples.pkl'
-#     headers = {
-#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-#     }
-#     response = requests.get(url, headers=headers)
-#     # Save the file
-#     with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'wb') as file2:
-#         file2.write(response.content)
-# # df = pd.DataFrame(columns=["paragraph"])
-# # df = df.append({"paragraph": input_paragraph}, ignore_index=True)
-# df = pd.DataFrame([input_paragraph], columns=["paragraph"])
-# # Variable to control number of words to retrieve
-# num_words = 500
-# # Retrieving only the first num_words words of the paragraph
-# input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
-# # Extracting features
-# def extract_features_AI_vs_AI_RandomForest_88_Samples(text):
-#     words = word_tokenize(text)
-#     sentences = sent_tokenize(text)
-#     avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
-#     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
-#     punctuation_count = len([char for char in text if char in '.,;:?!'])
-#     stopword_count = len([word for word in words if word in stopwords.words('english')])
-#     lemmatizer = WordNetLemmatizer()
-#     lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-#     named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
-#     tagged_words = nltk.pos_tag(words)
-#     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
-#     pos_features = {
-#         'pos_IN': pos_counts['IN'],
-#         'pos_DT': pos_counts['DT'],
-#         'pos_NN': pos_counts['NN'],
-#         'pos_,': pos_counts[','],
-#         'pos_VBZ': pos_counts['VBZ'],
-#         'pos_WDT': pos_counts['WDT'],
-#         'pos_TO': pos_counts['TO'],
-#         'pos_VB': pos_counts['VB'],
-#         'pos_VBG': pos_counts['VBG'],
-#         'pos_.': pos_counts['.'],
-#         'pos_JJ': pos_counts['JJ'],
-#         'pos_NNS': pos_counts['NNS'],
-#         'pos_RB': pos_counts['RB'],
-#         'pos_CC': pos_counts['CC'],
-#         'pos_VBN': pos_counts['VBN'],
-#     }
-#     features = {
-#         'avg_word_length': avg_word_length,
-#         'avg_sent_length': avg_sent_length,
-#         'punctuation_count': punctuation_count,
-#         'stopword_count': stopword_count,
-#         'lemma_count': lemma_count,
-#         'named_entity_count': named_entity_count,
-#     }
-#     features.update(pos_features)
-#     return pd.Series(features)
-# # Extracting features for AI_vs_AI_Ridge_2000_Samples
-# def extract_features_AI_vs_AI_Ridge_2000_Samples(text):
-#     words = word_tokenize(text)
-#     sentences = sent_tokenize(text)
-#     avg_word_length = sum(len(word) for word in words if word.isalpha()) / len(words)
-#     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
-#     punctuation_count = len([char for char in text if char in '.,;:?!'])
-#     stopword_count = len([word for word in words if word in stopwords.words('english')])
-#     lemmatizer = WordNetLemmatizer()
-#     lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-#     named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
-#     tagged_words = nltk.pos_tag(words)
-#     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
-#     pos_features = {
-#         'pos_IN': pos_counts['IN'],
-#         'pos_DT': pos_counts['DT'],
-#         'pos_NN': pos_counts['NN'],
-#         'pos_,': pos_counts[','],
-#         'pos_VBZ': pos_counts['VBZ'],
-#         'pos_WDT': pos_counts['WDT'],
-#         'pos_TO': pos_counts['TO'],
-#         'pos_VB': pos_counts['VB'],
-#         'pos_PRP': pos_counts['PRP'],
-#         'pos_VBP': pos_counts['VBP'],
-#         'pos_VBG': pos_counts['VBG'],
-#         'pos_.': pos_counts['.'],
-#         'pos_JJ': pos_counts['JJ'],
-#         'pos_NNS': pos_counts['NNS'],
-#         'pos_RB': pos_counts['RB'],
-#         'pos_PRP$': pos_counts['PRP$'],
-#         'pos_CC': pos_counts['CC'],
-#         'pos_MD': pos_counts['MD'],
-#         'pos_VBN': pos_counts['VBN'],
-#         'pos_NNP': pos_counts['NNP'],
-#     }
-#     features = {
-#         'avg_word_length': avg_word_length,
-#         'avg_sent_length': avg_sent_length,
-#         'punctuation_count': punctuation_count,
-#         'stopword_count': stopword_count,
-#         'lemma_count': lemma_count,
-#         'named_entity_count': named_entity_count,
-#     }
-#     # features.update(pos_features)
-#     features = pd.concat([features, pd.DataFrame(pos_features, index=[0])], axis=1)
-#     return pd.Series(features)
-# # Function from Code(2)
-# def add_vectorized_features(df):
-#     vectorizer = CountVectorizer()
-#     tfidf_vectorizer = TfidfVectorizer()
-#     X_bow = vectorizer.fit_transform(df['paragraph'])
-#     X_tfidf = tfidf_vectorizer.fit_transform(df['paragraph'])
-#     df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
-#     df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
-#     df = pd.concat([df, df_bow, df_tfidf], axis=1)
-#     return df
-# # Function define AI_vs_AI_RandomForest_88_Samples
-# def AI_vs_AI_RandomForest_88_Samples(df):
-#     input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
-#     # try:
-#     #     predicted_llm = clf_loaded.predict(input_features)
-#     #     st.write(f"Predicted LLM: {predicted_llm[0]}")
-#     #     predicted_proba = clf_loaded.predict_proba(input_features)
-#     # except Exception as e:
-#     #     st.write(f"An error occurred: {str(e)}")
-#     # labels = clf_loaded.classes_
-#     # # Create a mapping from old labels to new labels
-#     # label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}
-#     # # Apply the mapping to the labels
-#     # new_labels = [label_mapping[label] for label in labels]
-#     # # Create a dictionary that maps new labels to probabilities
-#     # prob_dict = {k: v for k, v in zip(new_labels, probabilities)}
-#     # # Convert probabilities to percentages and sort the dictionary in descending order
-#     # prob_dict = {k: f'{v*100:.2f}%' for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}
-#     # # Print the dictionary
-#     # #st.write(prob_dict)
-#     # # Create a progress bar and a bar chart for each LLM
-#     # for llm, prob in prob_dict.items():
-#     #     st.write(llm + ': ' + prob)
-#     #     st.progress(float(prob.strip('%'))/100)
-#     return
-# def AI_vs_AI_Ridge_2000_Samples(df):
-#     # At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
-#     with open('AI_vs_AI_Ridge_2000_Samples.pkl', 'rb') as file2:
-#         clf_loaded = pickle.load(file2)
-#     input_features = df['paragraph'].apply(extract_features_AI_vs_AI_Ridge_2000_Samples)
-#     # Here, input_features is a DataFrame, not a Series
-#     input_features = pd.concat(input_features.values, ignore_index=True)
-#     # Add new vectorized features
-#     df = add_vectorized_features(df)
-#     # Concatenate input_features and df along columns
-#     final_features = pd.concat([input_features, df], axis=1)
-#     predicted_llm = clf_loaded.predict(final_features)
-#     st.write(f"Predicted LLM: {predicted_llm[0]}")
-#     return
-# # Check if the file exists
-# if not os.path.isfile('RandomForestClassifier.pkl'):
-# # Download the zip file if it doesn't exist
-#     url = 'https://jaifar.net/RandomForestClassifier.pkl'
-#     headers = {
-#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-#     }
-#     response = requests.get(url, headers=headers)
-#     # Save the file
-#     try:
-#         with open('RandomForestClassifier.pkl', 'wb') as file:
-#             file.write(response.content)
-#     except Exception as e:
-#         st.write(f"An error occurred while writing RandomForestClassifier.pkl: {str(e)}")
-# try:
-#     with open('RandomForestClassifier.pkl', 'rb') as file:
-#         clf_loaded = pickle.load(file)
-# except Exception as e:
-#     st.write(f"An error occurred while loading RandomForestClassifier.pkl: {str(e)}")
-# # Creates a button
-# press_me_button = st.button("Which Model Used?")
-# if press_me_button:
-#     input_features = df['paragraph'].apply(extract_features_AI_vs_AI_RandomForest_88_Samples)
-#     try:
-#         predicted_llm = clf_loaded.predict(input_features)
-#         st.write(f"Predicted LLM: {predicted_llm[0]}")
-#         predicted_proba = clf_loaded.predict_proba(input_features)
-#     except Exception as e:
-#         st.write(f"An error occurred: {str(e)}")
-#     # # Use the selected option to control the flow of your application
-#     # if selected_option == "AI vs AI - RandomForest - 88 Samples":
-#     #     AI_vs_AI_RandomForest_88_Samples(df)
-#     # elif selected_option == "AI vs AI - Ridge - 2000 Samples":
-#     #     AI_vs_AI_Ridge_2000_Samples(df)
-#     # elif selected_option == "AI vs Human":
-#     #     st.write("You selected AI vs Human!")

 import os
 import requests
+import subprocess  # Import the subprocess module
+from keras.models import load_model
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from sklearn.preprocessing import LabelEncoder
+#from nltk.tokenize import word_tokenize  # Assuming you've imported this for word_tokenize
+import pickle
 import numpy as np
+import streamlit as st
+# Custom headers for the HTTP request
+headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+}
+# Debugging: Print current working directory initially
+st.write(f"Initial Current Working Directory: {os.getcwd()}")
+# Check if the model folder exists
+zip_file_path = "my_authorship_model_zip.zip"
+if not os.path.exists('my_authorship_model'):
+    try:
+        # Download the model
+        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
+        r = requests.get(model_url, headers=headers)
+        r.raise_for_status()
+        # Debugging: Check if download is successful by examining content length
+        st.write(f"Downloaded model size: {len(r.content)} bytes")
+        # Save the downloaded content
+        with open(zip_file_path, "wb") as f:
+            f.write(r.content)
+        # Debugging: Verify that the zip file exists
+        if os.path.exists(zip_file_path):
+            st.write("Zip file exists")
+            # Debugging: List contents of the zip file using unzip
+            subprocess.run(['unzip', '-l', zip_file_path])
+            # Extract the model using unzip
+            unzip_result = subprocess.run(['unzip', '-o', zip_file_path, '-d', 'my_authorship_model'])
+            # Debugging: Check unzip exit code (0 means success)
+            if unzip_result.returncode == 0:
+                st.write("Model folder successfully extracted using unzip")
+                # Debugging: List the directory contents after extraction
+                st.write("Listing directory contents:")
+                st.write(os.listdir('.'))
+            else:
+                st.write("Model folder was not extracted successfully using unzip")
+                exit(1)
+        else:
+            st.write("Zip file does not exist")
+            exit(1)
+    except Exception as e:
+        st.write(f"Failed to download or extract the model: {e}")
+        exit(1)
+else:
+    st.write("Model folder exists")
+# Debugging: Print current working directory after extraction
+st.write(f"Current Working Directory After Extraction: {os.getcwd()}")
+# Debugging: Check if model folder contains required files
+try:
+    model_files = os.listdir('my_authorship_model')
+    st.write(f"Files in model folder: {model_files}")
+except Exception as e:
+    st.write(f"Could not list files in model folder: {e}")
+# Download required files
+file_urls = {
+    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
+    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
+}
+for filename, url in file_urls.items():
+    try:
+        r = requests.get(url, headers=headers)
+        r.raise_for_status()
+        with open(filename, 'wb') as f:
+            f.write(r.content)
+    except Exception as e:
+        st.write(f"Failed to download {filename}: {e}")
+        exit(1)
+# Load the saved model
+loaded_model = load_model("my_authorship_model")
+# Load the saved tokenizer and label encoder
+with open('tokenizer.pkl', 'rb') as handle:
+    tokenizer = pickle.load(handle)
+with open('label_encoder.pkl', 'rb') as handle:
+    label_encoder = pickle.load(handle)
+max_length = 300  # As defined in the training code
+# Function to predict author for new text
+def predict_author(new_text, model, tokenizer, label_encoder):
+    sequence = tokenizer.texts_to_sequences([new_text])
+    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
+    prediction = model.predict(padded_sequence)
+    predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
+    probabilities = prediction[0]
+    author_probabilities = {}
+    for idx, prob in enumerate(probabilities):
+        author = label_encoder.inverse_transform([idx])[0]
+        author_probabilities[author] = prob
+    return predicted_label, author_probabilities
+st.markdown("CNN : version: 1.2")
+new_text = st.text_area("Input your text here")
+#words_counts = word_tokenize(new_text)  # Changed input_paragraph to new_text
+#final_words = len(words_counts)
+#st.write('Words counts: ', final_words)
+predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
+sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)
+st.write(f"The text is most likely written by: {predicted_author}")
+st.write("Probabilities for each author are (sorted):")
+for author, prob in sorted_probabilities:
+    st.write(f"{author}: {prob * 100:.2f}%")