jaifar530 committed
Commit 9a042e3 · unverified · 1 Parent(s): f3da86a
Files changed (1):
  1. app.py +23 -60
app.py CHANGED
@@ -1,90 +1,65 @@
 import streamlit as st
-
-#title
-st.title("Smart Detection System of AI-Generated Text Models")
-
-#subtitle
-st.markdown("## This is a POC repo for Smart Detection System of AI Generated Text Models project, it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)##")
-
 import os
 import requests
 import pickle
 import pandas as pd
 import nltk
-from spacy_huggingface_hub import en_core_web_sm
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.stem import WordNetLemmatizer
 import numpy as np
+
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
+nltk.download('wordnet') # needed for lemmatization
+
+# Setting up Hugging Face API for NER
+API_URL = "https://api-inference.huggingface.co/models/spacy/en_core_web_sm"
+headers = {"Authorization": "Bearer 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"}
+
+def get_entities(text):
+    data = {"inputs": text}
+    response = requests.post(API_URL, headers=headers, json=data)
+    entities = [item['entity_group'] for item in response.json()[0]]
+    return len(entities)
+
+# Set up lemmatizer
+lemmatizer = WordNetLemmatizer()
+
+#title
+st.title("Smart Detection System of AI-Generated Text Models")
+st.markdown("## This is a POC repo for Smart Detection System of AI Generated Text Models project, it is a pre-trained model that detect the probabilities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)##")
 
 # Check if the file exists
 if not os.path.isfile('RandomForestClassifier.pkl'):
     # Download the zip file if it doesn't exist
     url = 'https://jaifar.net/RandomForestClassifier.pkl'
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
-    }
-
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
     response = requests.get(url, headers=headers)
-
     # Save the file
     with open('RandomForestClassifier.pkl', 'wb') as file:
         file.write(response.content)
 
-# At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
 with open('RandomForestClassifier.pkl', 'rb') as file:
     clf_loaded = pickle.load(file)
 
-
-
-# # Loading a SpaCy model for Named Entity Recognition and Lemmatization
-# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
-
-# Using spacy.load().
-# import spacy
-nlp = spacy.load("en_core_web_sm")
-
-# # Importing as module.
-# import en_core_web_sm
-# nlp = en_core_web_sm.load()
-
-nlp = spacy.load('en_core_web_sm')
-
-# # Your input paragraph
-# input_paragraph = "Your paragraph here..."
-
-# # Read the paragraph from a text file
-# with open('paragraph.txt', 'r') as file:
-#     input_paragraph = file.read()
-
 input_paragraph = st.text_area("Input your text here")
-
 df = pd.DataFrame(columns=["paragraph"])
 df = df.append({"paragraph": input_paragraph}, ignore_index=True)
 
-
-
-# Variable to control number of words to retrieve
 num_words = 500
-
-# Retrieving only the first num_words words of the paragraph
 input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])
 
-# Extracting features
 def extract_features(text):
     words = word_tokenize(text)
     sentences = sent_tokenize(text)
-    doc = nlp(text)
-
     avg_word_length = sum(len(word) for word in words) / len(words)
     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
     punctuation_count = len([char for char in text if char in '.,;:?!'])
     stopword_count = len([word for word in words if word in stopwords.words('english')])
-    lemma_count = len(set(token.lemma_ for token in doc))
-    named_entity_count = len(doc.ents)
-
+    lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
+    named_entity_count = get_entities(text)
     tagged_words = nltk.pos_tag(words)
     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
     pos_features = {
@@ -104,7 +79,6 @@ def extract_features(text):
         'pos_CC': pos_counts['CC'],
         'pos_VBN': pos_counts['VBN'],
     }
-
     features = {
         'avg_word_length': avg_word_length,
         'avg_sent_length': avg_sent_length,
@@ -114,22 +88,11 @@ def extract_features(text):
         'named_entity_count': named_entity_count,
     }
     features.update(pos_features)
-
     return pd.Series(features)
-    #return pd.DataFrame(features)
 
-
-# Creates a button named 'Press me'
 press_me_button = st.button("Press me")
 
 if press_me_button:
-    # Display the text entered by the user
-
     input_features = df['paragraph'].apply(extract_features)
     predicted_llm = clf_loaded.predict(input_features)
     st.write(f"Predicted LLM: {predicted_llm[0]}")
-
-    # Get the features of the input paragraph
-    #input_features = extract_features(input_paragraph)
-
-
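A note on the new get_entities helper above: the committed code indexes response.json()[0] and reads entity_group fields, which presumes a nested payload. Hosted token-classification endpoints usually return a flat list of entity dicts, and whether the spacy/en_core_web_sm repo is servable through this pipeline at all is not something the commit verifies. Below is a minimal defensive sketch of the same call, with an environment-variable token standing in for the hard-coded one; the variable name HF_API_TOKEN and the shape handling are assumptions, not confirmed behavior.

import os
import requests

API_URL = "https://api-inference.huggingface.co/models/spacy/en_core_web_sm"
# Assumption: the token is supplied via the environment rather than hard-coded.
HEADERS = {"Authorization": f"Bearer {os.environ.get('HF_API_TOKEN', '')}"}

def count_entities(text):
    # POST the text to the hosted model and fail loudly on HTTP errors.
    response = requests.post(API_URL, headers=HEADERS, json={"inputs": text}, timeout=30)
    response.raise_for_status()
    payload = response.json()
    # Accept both a flat list of entity dicts and the nested form
    # that the committed code indexes with [0].
    if payload and isinstance(payload[0], list):
        payload = payload[0]
    return sum(1 for item in payload if "entity_group" in item)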
 
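The POS features that the diff leaves unchanged in extract_features come from nltk.pos_tag plus an nltk.FreqDist over the tags. A small self-contained example of that step in isolation (the sample sentence is illustrative only):

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

words = word_tokenize("The quick brown fox jumps over the lazy dog.")
tagged_words = nltk.pos_tag(words)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ...]
pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
print(pos_counts['DT'], pos_counts['NN'])  # determiner and singular-noun counts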
 