jaifar530 committed
Commit 10c6916 · unverified · 1 Parent(s): 309842b
Files changed (1):
  1. app.py +55 -28
app.py CHANGED
@@ -1,71 +1,94 @@
 import streamlit as st
+
+#title
+st.title("Smart Detection System of AI-Generated Text Models")
+
+#subtitle
+st.markdown("## This is a POC repo for Smart Detection System of AI Generated Text Models project, it is a pre-trained model that detect the probablities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)##")
+
 import os
 import requests
 import pickle
 import pandas as pd
 import nltk
+import spacy
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
-from nltk.stem import WordNetLemmatizer
 import numpy as np
+############
+from nltk.stem import WordNetLemmatizer
+from nltk import ne_chunk, pos_tag, word_tokenize
+from nltk.tree import Tree
+nltk.download('wordnet')
+nltk.download('maxent_ne_chunker')
+nltk.download('words')

+#######
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('averaged_perceptron_tagger')
-nltk.download('wordnet') # needed for lemmatization
-
-# Setting up Hugging Face API for NER
-API_URL = "https://api-inference.huggingface.co/models/spacy/en_core_web_sm"
-headers = {"Authorization": "Bearer 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"}
-
-def get_entities(text):
-    data = {"inputs": text}
-    response = requests.post(API_URL, headers=headers, json=data)
-    try:
-        entities = [item['entity_group'] for item in response.json()[0]]
-    except Exception as e:
-        print("Error:", e)
-        print("Response:", response.content)
-        entities = []
-    return len(entities)
-
-# Set up lemmatizer
-lemmatizer = WordNetLemmatizer()
-
-#title
-st.title("Smart Detection System of AI-Generated Text Models")
-st.markdown("## This is a POC repo for Smart Detection System of AI Generated Text Models project, it is a pre-trained model that detect the probabilities of using any of the known LLM (chatgpt3, chatgpt4, GoogleBard, HuggingfaceChat)##")

 # Check if the file exists
 if not os.path.isfile('RandomForestClassifier.pkl'):
     # Download the zip file if it doesn't exist
     url = 'https://jaifar.net/RandomForestClassifier.pkl'
-    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+    }
+
     response = requests.get(url, headers=headers)
+
     # Save the file
     with open('RandomForestClassifier.pkl', 'wb') as file:
         file.write(response.content)

+# At this point, the pickle file should exist, either it was already there, or it has been downloaded and extracted.
 with open('RandomForestClassifier.pkl', 'rb') as file:
     clf_loaded = pickle.load(file)

+
+
+# # Loading a SpaCy model for Named Entity Recognition and Lemmatization
+# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+
+# Using spacy.load().
+nlp = spacy.load('en_core_web_sm')
+
+# # Your input paragraph
+# input_paragraph = "Your paragraph here..."
+
+# # Read the paragraph from a text file
+# with open('paragraph.txt', 'r') as file:
+#     input_paragraph = file.read()
+
 input_paragraph = st.text_area("Input your text here")
+
 df = pd.DataFrame(columns=["paragraph"])
 df = df.append({"paragraph": input_paragraph}, ignore_index=True)

+
+
+# Variable to control number of words to retrieve
 num_words = 500
+
+# Retrieving only the first num_words words of the paragraph
 input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])

+# Extracting features
 def extract_features(text):
     words = word_tokenize(text)
     sentences = sent_tokenize(text)
+
     avg_word_length = sum(len(word) for word in words) / len(words)
     avg_sent_length = sum(len(sent) for sent in sentences) / len(sentences)
     punctuation_count = len([char for char in text if char in '.,;:?!'])
     stopword_count = len([word for word in words if word in stopwords.words('english')])
+
+    lemmatizer = WordNetLemmatizer()
     lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
-    named_entity_count = get_entities(text)
-    st.write(named_entity_count)
+
+    named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
+
     tagged_words = nltk.pos_tag(words)
     pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
     pos_features = {
@@ -85,6 +108,7 @@ def extract_features(text):
         'pos_CC': pos_counts['CC'],
         'pos_VBN': pos_counts['VBN'],
     }
+
     features = {
         'avg_word_length': avg_word_length,
         'avg_sent_length': avg_sent_length,
@@ -94,8 +118,11 @@ def extract_features(text):
         'named_entity_count': named_entity_count,
     }
     features.update(pos_features)
+
     return pd.Series(features)

+
+# Creates a button named 'Press me'
 press_me_button = st.button("Press me")

 if press_me_button:
@@ -117,4 +144,4 @@ if press_me_button:
     prob_dict = dict(zip(new_labels, probabilities))

     # Print the dictionary
-    st.write(prob_dict)
+    st.write(prob_dict)
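
For reference, the NLTK-based named-entity counting that this commit introduces (replacing the remote Hugging Face Inference API call) can be exercised on its own before running the Streamlit app. The snippet below is a minimal sketch, not part of the commit: the sample sentence and variable names are illustrative, and it assumes the same NLTK resources the new app.py downloads at startup. It also notes that `DataFrame.append`, which the new version still uses, was removed in pandas 2.0; building the one-row frame directly is a drop-in alternative.

import nltk
import pandas as pd
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

# Same resources the committed app.py downloads at startup.
for pkg in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(pkg, quiet=True)

# Illustrative input; any paragraph works.
sample = "OpenAI and Google released new chatbots while Omar was visiting Muscat."

words = word_tokenize(sample)
# ne_chunk wraps each recognized entity (PERSON, GPE, ORGANIZATION, ...) in a Tree subtree,
# so counting Tree instances counts entity chunks, exactly as in the new extract_features().
named_entity_count = len([chunk for chunk in ne_chunk(pos_tag(words)) if isinstance(chunk, Tree)])
print(named_entity_count)  # number of entity chunks NLTK finds in the sample

# DataFrame.append was removed in pandas 2.0; constructing the one-row frame directly
# (or using pd.concat) replaces the columns-then-append pattern used in app.py.
df = pd.DataFrame({"paragraph": [sample]})
print(df)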