Spaces:
Running
Running
File size: 4,961 Bytes
b576153 ac163d9 b576153 ac163d9 fa5527f ac163d9 5edecda ac163d9 004e551 ac163d9 5edecda ac163d9 5edecda ac163d9 5edecda ac163d9 5edecda ac163d9 5006f20 ac163d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import os
import requests
import subprocess # Import the subprocess module
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
#from nltk.tokenize import word_tokenize # Assuming you've imported this for word_tokenize
import pickle
import numpy as np
import streamlit as st
# Custom headers for the HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}

# Debugging: Print current working directory initially
st.write(f"Initial Current Working Directory: {os.getcwd()}")

# Download and unpack the model only when the extracted folder is not
# already present; its existence is the only "already installed" check.
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model. The timeout bounds how long a connect or a
        # read may stall, so an unresponsive server cannot hang the app.
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers, timeout=60)
        r.raise_for_status()

        # Debugging: Check if download is successful by examining content length
        st.write(f"Downloaded model size: {len(r.content)} bytes")

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        # Debugging: Verify that the zip file exists
        if os.path.exists(zip_file_path):
            st.write("Zip file exists")

            # Debugging: List contents of the zip file using unzip
            # (output is not captured, so it goes to the process's stdout).
            subprocess.run(['unzip', '-l', zip_file_path])

            # Extract into a scratch directory and rename it only after
            # unzip reports success. A failed or interrupted extraction
            # then never leaves a 'my_authorship_model' folder behind that
            # the existence check above would accept on the next run.
            extract_dir = 'my_authorship_model.partial'
            unzip_result = subprocess.run(['unzip', '-o', zip_file_path, '-d', extract_dir])

            # Debugging: Check unzip exit code (0 means success)
            if unzip_result.returncode == 0:
                os.replace(extract_dir, 'my_authorship_model')
                st.write("Model folder successfully extracted using unzip")
                # Debugging: List the directory contents after extraction
                st.write("Listing directory contents:")
                st.write(os.listdir('.'))
            else:
                st.write("Model folder was not extracted successfully using unzip")
                # st.stop() is Streamlit's documented way to halt the
                # current script run; exit() is the interactive helper
                # from `site` and raises SystemExit instead.
                st.stop()
        else:
            st.write("Zip file does not exist")
            st.stop()
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        st.stop()
else:
    st.write("Model folder exists")

# Debugging: Print current working directory after extraction
st.write(f"Current Working Directory After Extraction: {os.getcwd()}")
# Debugging: show what the model folder contains. This is best-effort:
# any failure is reported on the page and the script carries on.
try:
    model_files = os.listdir('my_authorship_model')
    st.write(f"Files in model folder: {model_files}")
except Exception as err:
    st.write(f"Could not list files in model folder: {err}")
# Download required files (tokenizer and label encoder pickles)
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl',
}

for filename, url in file_urls.items():
    # Reuse a file that is already on disk, the same way the model folder
    # is reused, instead of downloading it again on every script run.
    if os.path.exists(filename):
        continue
    try:
        # The timeout keeps an unresponsive server from hanging the app.
        r = requests.get(url, headers=headers, timeout=60)
        r.raise_for_status()
        # Write to a temporary name and rename afterwards, so an
        # interrupted write can never leave a truncated file that the
        # existence check above would later accept.
        partial_path = filename + '.part'
        with open(partial_path, 'wb') as f:
            f.write(r.content)
        os.replace(partial_path, filename)
    except Exception as e:
        st.write(f"Failed to download {filename}: {e}")
        # st.stop() is Streamlit's documented way to halt the script run.
        st.stop()
# Restore the trained model from its extracted folder
loaded_model = load_model("my_authorship_model")

# Restore the tokenizer and the label encoder from their pickle files.
# NOTE(review): both files are fetched from a remote URL earlier in this
# script, and pickle.load can execute arbitrary code while deserializing,
# so the download source has to be trusted.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

with open('label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

# Length every input sequence is padded/truncated to (as defined in the
# training code)
max_length = 300
# Function to predict author for new text
def predict_author(new_text, model, tokenizer, label_encoder):
    """Predict the most likely author of ``new_text``.

    The text is turned into an integer sequence with ``tokenizer``, padded
    or truncated at the end to the module-level ``max_length``, and passed
    to ``model.predict``. The first row of the prediction is read as one
    score per class index.

    Args:
        new_text: The text to classify.
        model: Object exposing ``predict`` for the padded sequence batch.
        tokenizer: Object exposing ``texts_to_sequences``.
        label_encoder: Object exposing ``inverse_transform``, used to map
            class indices back to author labels.

    Returns:
        A ``(predicted_label, author_probabilities)`` tuple, where
        ``predicted_label`` is the label with the highest score and
        ``author_probabilities`` maps every label to its score.
    """
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence)

    # Only one text was submitted, so only the first row is relevant.
    probabilities = prediction[0]

    # Decode all class indices with a single inverse_transform call rather
    # than one call per class plus another one for the winning index.
    authors = label_encoder.inverse_transform(list(range(len(probabilities))))
    predicted_label = authors[probabilities.argmax()]
    author_probabilities = dict(zip(authors, probabilities))

    return predicted_label, author_probabilities
# Page body: version banner, text input, and the prediction report.
st.markdown("CNN : version: 1.2")
new_text = st.text_area("Input your text here")

# The text area is empty on first render; skip the model call until there
# is something to classify rather than reporting an author for blank input.
if new_text and new_text.strip():
    predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)

    # Highest score first
    sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

    st.write(f"The text is most likely written by: {predicted_author}")
    st.write("Probabilities for each author are (sorted):")
    for author, prob in sorted_probabilities:
        st.write(f"{author}: {prob * 100:.2f}%")
else:
    st.write("Enter some text above to see the predicted author.")
|