Spaces:
Running
Running
Upload 4 files
Browse files- .gitattributes +2 -0
- app.py +117 -0
- requirements.txt +1 -0
- tfidf_vectorizer.sav +3 -0
- trained_model.sav +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tfidf_vectorizer.sav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
trained_model.sav filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import html
import pickle
import re

import streamlit as st
|
3 |
+
|
4 |
+
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*."""
    # NOTE(review): pickle executes arbitrary code while loading; only ship
    # artifacts that were produced by a trusted training run.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Trained model used for prediction (exposes ``predict``).
model_filename = 'trained_model.sav'
model = _load_pickle(model_filename)

# TF-IDF vectorizer used to turn raw text into model features
# (exposes ``transform``).
vectorizer_filename = 'tfidf_vectorizer.sav'
vectorizer = _load_pickle(vectorizer_filename)
|
13 |
+
|
14 |
+
# Lexicon of abusive words and phrases that get masked in displayed text.
# Entries may repeat or carry stray whitespace; both are normalised when the
# matcher below is compiled, so the list can be edited without extra care.
_CYBERBULLYING_WORDS = (
    "bitch", "fuck", "asshole", "shitty", "ass", "slut", "cunt", "motherfucker", "wanker", "dick", "shit", "bastard",
    "pissed off", "arse", "bugger", "bloody", "whore", "anal", "anus", "arse", "arrse", "assbag", "assbandit",
    "assbanger", "assbite", "assclown", "asscock", "asscracker", "asses", "assface", "assfuck", "assfucker",
    "assfukka", "assgoblin", "asshat", "asshead", "asshole", "assholes", "asshopper", "assjacker", "asslick",
    "asslicker", "assmonkey", "assmunch", "assmuncher", "assnigger", "asspirate", "assshit", "assshole", "asssucker",
    "asswad", "asswhole", "asswipe", "auto erotic", "autoerotic", "asswound", "boobs", "bitch", "b1tch", "ballbag",
    "balls", "ballsack", "bampot", "bangbros", "bareback", "barely legal", "barenaked", "bastard", "bastardo", "bastinado",
    "bbw", "bdsm", "bitches", "bitchin", "bitching", "bitchtits", "bitchy", "blowjob", "boob", "boobs", "booobs", "boooobs",
    "booooobs", "booooooobs", "brotherfucker", "bumblefuck", "bung hole", "buttcheeks", "buttfucka",
    "buttfucker", "butthole", "buttmuch", "buttplug", "cock", "cocksucker", "camgirl", "camslut",
    "camwhore", "circlejerk", "clit", "cleveland steamer", "clit", "clitface", "clitfuck", "clitoris",
    "clits", "clover clamps", "clusterfuck", "cockass", "cockbite", "cockburger", "cockeye", "cockface",
    "cockfucker", "cockhead", "cockjockey", "cockknoker", "cocklump", "cockmaster", "cockmongler",
    "cockmongruel", "cockmonkey", "cockmunch", "cockmuncher", "cocknose", "cocknugget", "cocks",
    "cockshit", "cocksmith", "cocksmoke", "cocksmoker", "cocksniffer", "cocksuck", "cocksucked",
    "cocksucker", "cocksucking", "cocksucks", "cocksuka", "cocksukka", "cockwaffle", "cok", "cokmuncher",
    "coons", "cooter", "coprolagnia", "coprophilia", "cornhole", "creampie", "crotte", "cum", "cumbubble",
    "cumdumpster", "cumguzzler", "cumjockey", "cummer", "cumming", "cums", "cumshot", "cumslut", "cumtart",
    "cunilingus", "cunillingus", "cunnie", "cunnilingus", "cunt", "cuntass", "cuntface", "cunthole",
    "cuntlick", "cuntlicker", "cuntlicking", "cuntrag", "cunts", "cuntslut", "cyalis", "cyberfuc",
    "cyberfuck", "cyberfucked", "cyberfucker", "cyberfuckers", "cyberfucking", "d1ck", "darkie",
    "date rape", "daterape", "deep throat", "deepthroat", "deggo", "dendrophilia", "dick", "dickbag",
    "dickbeaters", "dickface", "dickfuck", "dickfucker", "dickhead", "dickhole", "dickjuice", "dickmilk ",
    "dickmonger", "dicks", "dickslap", "dicksucker", "dicksucking", "dicktickler", "dickwad",
    "dickweasel", "dickweed", "dickwod", "dike", "dildo", "dildos", "doggystyle ", "donkeyribber",
    "doochbag", "double dong", "penetration", "doublelift", "douche", "douchebag", "dumbass",
    "dumbcunt", "dumbfuck", "ejaculate", "ejaculated", "ejaculates", "ejaculating", "ejaculatings",
    "ejaculation", "ejakulate", "erotism", "eunuch", "f u c k", "fagfucker", "fagging", "faggit",
    "faggitt", "faggot", "faggotcock", "fanny", "fannyflaps", "fannyfucker", "fanyy", "fatass", "fuck",
    "fucker", "fucking", "fecal", "feck", "fecker", "felch", "felching", "fellate", "fellatio", "feltch",
    "female squirting", "femdom", "figging", "fingerbang", "fingerfuck", "fingerfucked", "fingerfucker",
    "fingerfuckers", "fingerfucking", "fingerfucks", "fingering", "fistfuck", "fistfucked", "fistfucker",
    "fistfuckers", "fistfucking", "fistfuckings", "fistfucks", "fisting",
    "footjob", "frotting", "fuckass", "fuckbag", "fuckboy", "fuckbrain", "fuckbutt",
    "fuckersucker", "fuckface", "fuckhead", "fuckheads", "fuckhole", "fuckin", "fucking",
    "fuckings", "fucking shit motherfucker", "fuckme", "fucknut", "fucknutt", "fuckoff", "fucks",
    "fuckstick", "fucktard", "fucktards", "fucktart", "fucktwat", "fuckup", "fuckwad", "fuckwhit",
    "fuckwit", "fuckwitt", "fudge packer", "fudgepacker", "fuk", "fuker", "fukker", "fukkin", "fuks",
    "fukwhit", "fukwit", "futanari", "fux", "fuxor", "g-spot", "gangbang", "gangbanged", "gangbangs",
    "gayass", "gaybob", "gaydo", "gayfuck", "gayfuckist", "goregasm", "handjob", "hard core", "hardcore",
    "hardcoresex", "hooker", "arse", "ass fuck", "ass hole", "assfucker", "asshole", "assshole", "bastard",
    "fucking bitch", "cock", "bloody hell", "boong", "cockfucker", "cocksuck", "coon", "cyberfuck",
    "erection", "erotic", "faggot fuck", "fuck off", "fuck you", "fuckass", "fuckhole", "hardcore",
    "lesbian", "lesbians", "motherfuck", "negro", "nigger", "orgasim", "orgasm", "penis", "penisfucker",
    "piss", "piss off", "pussy", "sexy shit", "sexy slut", "son of a bitch", "suck tits", "xxx",
    "kill yourself", "fuck yourself", "beheading", "terrorist",
)


def _compile_bad_word_pattern(words):
    """Build one case-insensitive regex that matches any entry of *words* as a whole word."""
    unique = {word.strip().lower() for word in words if word.strip()}
    if not unique:
        # An empty alternation would match the empty string everywhere.
        return re.compile(r"(?!)")
    # Longest entries first: regex alternation takes the first alternative
    # that matches, so "assholes" must be tried before "asshole" and "ass".
    ordered = sorted(unique, key=lambda word: (-len(word), word))
    alternation = "|".join(re.escape(word) for word in ordered)
    # Lookarounds instead of \b so multi-word phrases and entries such as
    # "g-spot" are still bounded by non-word characters on both sides.
    return re.compile(rf"(?<!\w)(?:{alternation})(?!\w)", re.IGNORECASE)


# Compiled once at import time; Streamlit re-runs the script on every
# interaction, so rebuilding the lexicon per call would be wasted work.
_BAD_WORD_PATTERN = _compile_bad_word_pattern(_CYBERBULLYING_WORDS)


def mask_bad_words(text):
    """Mask lexicon words in *text* and report which ones were found.

    Matching is case-insensitive and whole-word only, so innocent words that
    merely contain a lexicon entry ("class", "analysis") are left alone.
    The flip side is that compounds absent from the lexicon are not caught
    through their substrings; add such forms to the lexicon explicitly.

    Args:
        text: The raw user text. Its original casing is preserved.

    Returns:
        A ``(filtered_text, bad_words)`` tuple. ``filtered_text`` is *text*
        with every match replaced by asterisks of the same length;
        ``bad_words`` lists each matched entry once, lower-cased, in order of
        first appearance.
    """
    found = []

    def _mask(match):
        hit = match.group(0)
        key = hit.lower()
        if key not in found:
            found.append(key)
        return "*" * len(hit)

    return _BAD_WORD_PATTERN.sub(_mask, text), found


def predict_cyberbullying_and_filter_bad_words(text):
    """Classify *text* with the loaded model and mask abusive words in it.

    Args:
        text: The raw user text.

    Returns:
        A ``(prediction, filtered_text, bad_words)`` tuple: the first element
        of ``model.predict`` for the text, the text with lexicon words
        replaced by asterisks, and the list of lexicon words that were found.
    """
    # The model sees lower-cased text, as before; masking runs on the
    # untouched input so the text shown back to the user keeps its casing.
    text_tfidf = vectorizer.transform([text.lower()])
    prediction = model.predict(text_tfidf)

    filtered_text, bad_words = mask_bad_words(text)
    return prediction[0], filtered_text, bad_words
|
84 |
+
|
85 |
+
# Streamlit front end: a text box, a "Predict" button, the classification
# result with any masked words, and a few sample sentences to try.
st.title("Cyberbullying Detection App (English)")

# Free-form text to classify.
user_input = st.text_area("Enter text:", "")

# Classify only on an explicit click, and only when something was typed.
if st.button("Predict") and user_input:
    prediction, filtered_text, bad_words = predict_cyberbullying_and_filter_bad_words(user_input)

    # Any label other than "not_cyberbullying" is shown as the bullying type.
    if prediction != "not_cyberbullying":
        st.write("Prediction: Cyberbullying")
        st.write(f"Cyberbullying Type: {prediction}")
    else:
        st.write("Prediction: Not Cyberbullying")

    if bad_words:
        st.write(f"Bad Words: {', '.join(bad_words)}")
        st.write("Filtered Text:")
        # The text originates from the user, so it is HTML-escaped before
        # being placed inside markup rendered with unsafe_allow_html=True.
        st.write(
            f"<span style='color:red; font-weight:bold'>{html.escape(filtered_text)}</span>",
            unsafe_allow_html=True,
        )
    else:
        st.write("<span style='color:cyan;'>No bad words found.</span>", unsafe_allow_html=True)
        st.write("Original Text:")
        # No markup is needed here, so leave HTML disabled and let Streamlit
        # escape any tags contained in the user's text.
        st.write(filtered_text)


st.header("Sample Texts")
st.write("It's always the filthy <span style='color:red; font-weight:bold'>bitch</span> that creates problem between us", unsafe_allow_html=True)
st.write("Do you believe it is appropriate to refer to a Muslim as a <span style='color:red; font-weight:bold'>terrorist</span>?", unsafe_allow_html=True)
st.write("I hope you're doing well and having a great day. Let's catch up soon! 😊")
st.write("The team's score is disgraceful.")
|
117 |
+
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
scikit-learn
|
tfidf_vectorizer.sav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4348f9a8bf9f6296ccebee2a79efca27ff06d2ff76f48e8c587000c17143a71e
|
3 |
+
size 1047315
|
trained_model.sav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7ebdf45a74171d8a3dc272d2d21b3022979576f601f20ad5ec05445887935999
|
3 |
+
size 1440050
|