AI-Text-Detector

Sleeping

App Files Files Community

jaifar530 commited on Jun 18, 2023

Commit

9a042e3

unverified ·

1 Parent(s): f3da86a

fix spycy

Browse files

Files changed (1) hide show

app.py +23 -60

app.py CHANGED Viewed

@@ -1,90 +1,65 @@
 import streamlit as st
-#title
-st.title("Smart Detection System of AI-Generated Text Models")
-#subtitle
-st.markdown("## This is a POC repo for Smart Detection System of AI Generated Text Models project, it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)##")
 import os
 import requests
 import pickle
 import pandas as pd
 import nltk
-from spacy_huggingface_hub import en_core_web_sm
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
 import numpy as np
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
 # Check if the file exists
 if not os.path.isfile('RandomForestClassifier.pkl'):
     # Download the zip file if it doesn't exist
     url = 'https://jaifar.net/RandomForestClassifier.pkl'
-    headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-    }
     response = requests.get(url, headers=headers)
     # Save the file
     with open('RandomForestClassifier.pkl', 'wb') as file:
         file.write(response.content)
-# At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
 with open('RandomForestClassifier.pkl', 'rb') as file:
     clf_loaded = pickle.load(file)
-# # Loading a SpaCy model for Named Entity Recognition and Lemmatization
-# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
-# Using spacy.load().
-# import spacy
-nlp = spacy.load("en_core_web_sm")
-# # Importing as module.
-# import en_core_web_sm
-# nlp = en_core_web_sm.load()
-nlp = spacy.load('en_core_web_sm')
-# # Your input paragraph
-# input_paragraph = "Your paragraph here..."
-# # Read the paragraph from a text file
-# with open('paragraph.txt', 'r') as file:
-#     input_paragraph = file.read()
 input_paragraph = st.text_area("Input your text here")
 df = pd.DataFrame(columns=["paragraph"])
 df = df.append({"paragraph": input_paragraph}, ignore_index=True)
-# Variable to control number of words to retrieve
 num_words = 500
-# Retrieving only the first num_words words of the paragraph
 input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
-# Extracting features
 def extract_features(text):
     words = word_tokenize(text)
     sentences = sent_tokenize(text)
-    doc = nlp(text)
     avg_word_length = sum(len(word) for word in words) / len(words)
     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
     punctuation_count = len([char for char in text if char in '.,;:?!'])
     stopword_count = len([word for word in words if word in stopwords.words('english')])
-    lemma_count = len(set(token.lemma_ for token in doc))
-    named_entity_count = len(doc.ents)
     tagged_words = nltk.pos_tag(words)
     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
     pos_features = {
@@ -104,7 +79,6 @@ def extract_features(text):
         'pos_CC': pos_counts['CC'],
         'pos_VBN': pos_counts['VBN'],
     }
     features = {
         'avg_word_length': avg_word_length,
         'avg_sent_length': avg_sent_length,
@@ -114,22 +88,11 @@ def extract_features(text):
         'named_entity_count': named_entity_count,
     }
     features.update(pos_features)
     return pd.Series(features)
-    #return pd.DataFrame(features)
-# Creates a button named 'Press me'
 press_me_button = st.button("Press me")
 if press_me_button:
-    # Display the text entered by the user
     input_features = df['paragraph'].apply(extract_features)
     predicted_llm = clf_loaded.predict(input_features)
     st.write(f"Predicted LLM: {predicted_llm[0]}")
-# Get the features of the input paragraph
-#input_features = extract_features(input_paragraph)

 import streamlit as st
 import os
 import requests
 import pickle
 import pandas as pd
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.stem import WordNetLemmatizer
 import numpy as np
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
+nltk.download('wordnet')  # needed for lemmatization
+# Setting up Hugging Face API for NER
+API_URL = "https://api-inference.huggingface.co/models/spacy/en_core_web_sm"
+headers = {"Authorization": "Bearer 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"}
+def get_entities(text):
+    data = {"inputs": text}
+    response = requests.post(API_URL, headers=headers, json=data)
+    entities = [item['entity_group'] for item in response.json()[0]]
+    return len(entities)
+# Set up lemmatizer
+lemmatizer = WordNetLemmatizer()
+#title
+st.title("Smart Detection System of AI-Generated Text Models")
+st.markdown("## This is a POC repo for Smart Detection System of AI Generated Text Models project, it is a pre-trained model that detect the probabilities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)##")
 # Check if the file exists
 if not os.path.isfile('RandomForestClassifier.pkl'):
     # Download the zip file if it doesn't exist
     url = 'https://jaifar.net/RandomForestClassifier.pkl'
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
     response = requests.get(url, headers=headers)
     # Save the file
     with open('RandomForestClassifier.pkl', 'wb') as file:
         file.write(response.content)
 with open('RandomForestClassifier.pkl', 'rb') as file:
     clf_loaded = pickle.load(file)
 input_paragraph = st.text_area("Input your text here")
 df = pd.DataFrame(columns=["paragraph"])
 df = df.append({"paragraph": input_paragraph}, ignore_index=True)
 num_words = 500
 input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
 def extract_features(text):
     words = word_tokenize(text)
     sentences = sent_tokenize(text)
     avg_word_length = sum(len(word) for word in words) / len(words)
     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
     punctuation_count = len([char for char in text if char in '.,;:?!'])
     stopword_count = len([word for word in words if word in stopwords.words('english')])
+    lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
+    named_entity_count = get_entities(text)
     tagged_words = nltk.pos_tag(words)
     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
     pos_features = {
         'pos_CC': pos_counts['CC'],
         'pos_VBN': pos_counts['VBN'],
     }
     features = {
         'avg_word_length': avg_word_length,
         'avg_sent_length': avg_sent_length,
         'named_entity_count': named_entity_count,
     }
     features.update(pos_features)
     return pd.Series(features)
 press_me_button = st.button("Press me")
 if press_me_button:
     input_features = df['paragraph'].apply(extract_features)
     predicted_llm = clf_loaded.predict(input_features)
     st.write(f"Predicted LLM: {predicted_llm[0]}")