import streamlit as st
import os
import requests
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')  # needed for lemmatization
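# These NLTK resources back the feature pipeline below:
# punkt -> word/sentence tokenization, stopwords -> stopword_count,
# averaged_perceptron_tagger -> POS-tag features, wordnet -> lemmatization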

# Setting up the Hugging Face Inference API for named-entity counting
API_URL = "https://api-inference.huggingface.co/models/spacy/en_core_web_sm"
# NOTE: the token is hardcoded for this POC; in a real deployment it should be
# read from a secret (e.g. an environment variable), not committed to the repo.
headers = {"Authorization": "Bearer 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"}

def get_entities(text):
    """Return the number of named entities found in `text` by the remote NER model."""
    data = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=data)
    try:
        entities = [item['entity_group'] for item in response.json()[0]]
    except Exception as e:
        print("Error:", e)
        print("Response:", response.content)
        entities = []
    return len(entities)
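# Example (hedged): get_entities("Barack Obama visited Paris.") should return the
# number of entity spans reported by the remote model (e.g. 2), or 0 if the API call fails.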

# Set up lemmatizer
lemmatizer = WordNetLemmatizer()

# Title and description
st.title("Smart Detection System of AI-Generated Text Models")
st.markdown("## This is a POC for the Smart Detection System of AI-Generated Text Models project: a pre-trained classifier that estimates the probability that a text was generated by one of the known LLMs (ChatGPT-3, ChatGPT-4, Google Bard, HuggingFace Chat). ##")

# Check if the model file exists locally
if not os.path.isfile('RandomForestClassifier.pkl'):
    # Download the pickled model if it doesn't exist
    # (use a separate headers dict so the API auth headers above are not overwritten)
    url = 'https://jaifar.net/RandomForestClassifier.pkl'
    download_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=download_headers)
    response.raise_for_status()  # fail early if the download did not succeed
    # Save the model file
    with open('RandomForestClassifier.pkl', 'wb') as file:
        file.write(response.content)

with open('RandomForestClassifier.pkl', 'rb') as file:
    clf_loaded = pickle.load(file)
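# clf_loaded is assumed to be a scikit-learn RandomForestClassifier fitted on the
# same feature columns that extract_features() below produces.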

input_paragraph = st.text_area("Input your text here")

# Truncate the input to the first 500 word tokens *before* building the DataFrame,
# so the features are computed on the truncated text
num_words = 500
input_paragraph = ' '.join(word_tokenize(input_paragraph)[:num_words])

# DataFrame.append was removed in pandas 2.0, so build the frame directly
df = pd.DataFrame({"paragraph": [input_paragraph]})

def extract_features(text):
    """Compute the stylometric features the classifier was trained on."""
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # Guard against empty input to avoid division by zero
    avg_word_length = sum(len(word) for word in words) / max(len(words), 1)
    avg_sent_length = sum(len(sent) for sent in sentences) / max(len(sentences), 1)
    punctuation_count = len([char for char in text if char in '.,;:?!'])
    stopword_count = len([word for word in words if word in stop_words])
    lemma_count = len(set(lemmatizer.lemmatize(word) for word in words))
    named_entity_count = get_entities(text)

    # Counts for the POS tags used during training
    tagged_words = nltk.pos_tag(words)
    pos_counts = nltk.FreqDist(tag for (word, tag) in tagged_words)
    pos_features = {
        'pos_IN': pos_counts['IN'],
        'pos_DT': pos_counts['DT'],
        'pos_NN': pos_counts['NN'],
        'pos_,': pos_counts[','],
        'pos_VBZ': pos_counts['VBZ'],
        'pos_WDT': pos_counts['WDT'],
        'pos_TO': pos_counts['TO'],
        'pos_VB': pos_counts['VB'],
        'pos_VBG': pos_counts['VBG'],
        'pos_.': pos_counts['.'],
        'pos_JJ': pos_counts['JJ'],
        'pos_NNS': pos_counts['NNS'],
        'pos_RB': pos_counts['RB'],
        'pos_CC': pos_counts['CC'],
        'pos_VBN': pos_counts['VBN'],
    }
    features = {
        'avg_word_length': avg_word_length,
        'avg_sent_length': avg_sent_length,
        'punctuation_count': punctuation_count,
        'stopword_count': stopword_count,
        'lemma_count': lemma_count,
        'named_entity_count': named_entity_count,
    }
    features.update(pos_features)
    return pd.Series(features)
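# Example (hypothetical input): extract_features("A short sample sentence.") returns a
# pandas Series with 21 values: the 6 general features plus the 15 POS-tag counts above.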

press_me_button = st.button("Press me")

if press_me_button:
    input_features = df['paragraph'].apply(extract_features)
    predicted_llm = clf_loaded.predict(input_features)
    st.write(f"Predicted LLM: {predicted_llm[0]}")
    
    predicted_proba = clf_loaded.predict_proba(input_features)
    probabilities = predicted_proba[0]
    labels = clf_loaded.classes_

    # Map the numeric class labels to human-readable model names
    label_mapping = {1: 'gpt3', 2: 'gpt4', 3: 'googlebard', 4: 'huggingface'}

    # Apply the mapping to the labels
    new_labels = [label_mapping[label] for label in labels]

    # Create a dictionary that maps new labels to probabilities
    prob_dict = dict(zip(new_labels, probabilities))

    # Display the per-model probabilities in the app (print would only reach the server console)
    st.write(prob_dict)
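    # A simple visualization could be added here, e.g. (hedged, optional):
    # st.bar_chart(pd.DataFrame({"probability": probabilities}, index=new_labels))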