naftalindeapo committed
Commit d7068b2 · verified · 1 Parent(s): e35ab5d

Upload 2 files

Files changed (2):
  1. app.py +134 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,134 @@
+ import re
+ import torch
+ import gradio as gr
+ from transformers import pipeline, AutoTokenizer
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ class AbuseHateProfanityDetector:
+     def __init__(self):
+         # Device configuration (CPU or GPU)
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Initialize detection models
+         self.Abuse_detector = pipeline("text-classification", model="Hate-speech-CNERG/english-abusive-MuRIL", device=self.device)
+         self.Hate_speech_detector = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-hate-latest", device=self.device)
+         self.Profanity_detector = pipeline("text-classification", model="tarekziade/pardonmyai", device=self.device)
+
+         # Load tokenizers
+         self.abuse_tokenizer = AutoTokenizer.from_pretrained('Hate-speech-CNERG/english-abusive-MuRIL')
+         self.hate_speech_tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-hate-latest')
+         self.profanity_tokenizer = AutoTokenizer.from_pretrained('tarekziade/pardonmyai')
+
+         # Define max token sizes for each model
+         self.Abuse_max_context_size = 512
+         self.HateSpeech_max_context_size = 512
+         self.Profanity_max_context_size = 512
+
+     def preprocess_and_clean_text(self, text: str) -> str:
+         """
+         Preprocesses and cleans the text.
+         """
+         stammering_pattern = r'\b(\w+)\s*[,;]+\s*(\1\b\s*[,;]*)+'
+         passage_without_stammering = re.sub(stammering_pattern, r'\1', text)
+         passage_without_um = re.sub(r'\bum\b', ' ', passage_without_stammering)
+         modified_text = re.sub(r'\s*,+\s*', ', ', passage_without_um)
+         processed_text = re.sub(r'\s+([^\w\s])', r'\1', modified_text)
+         processed_text = re.sub(r'\s+', ' ', processed_text)
+         pattern = r'(\.\s*)+'
+         cleaned_text = re.sub(pattern, '.', processed_text)
+         return cleaned_text.strip()
+
+     def token_length(self, text, tokenizer):
+         """
+         Computes the token length of a text.
+         """
+         tokens = tokenizer.encode(text, add_special_tokens=False)
+         return len(tokens)
+
+     def create_token_length_wrapper(self, tokenizer):
+         """
+         Creates a closure to calculate token length using the tokenizer.
+         """
+         def token_length_wrapper(text):
+             return self.token_length(text, tokenizer)
+         return token_length_wrapper
+
+     def chunk_text(self, text, tokenizer, max_length):
+         """
+         Chunks the input text based on the max token length and cleans the text.
+         """
+         text = self.preprocess_and_clean_text(text)
+         token_length_wrapper = self.create_token_length_wrapper(tokenizer)
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_length - 2, length_function=token_length_wrapper)
+         chunks = text_splitter.split_text(text)
+         return chunks
+
+     def classify_text(self, text: str):
+         """
+         Classifies text for abuse, hate speech, and profanity using the respective models.
+         """
+         # Split text into chunks for each classification model
+         abuse_chunks = self.chunk_text(text, self.abuse_tokenizer, self.Abuse_max_context_size)
+         hate_speech_chunks = self.chunk_text(text, self.hate_speech_tokenizer, self.HateSpeech_max_context_size)
+         profanity_chunks = self.chunk_text(text, self.profanity_tokenizer, self.Profanity_max_context_size)
+
+         # Initialize flags
+         abusive_flag = False
+         hatespeech_flag = False
+         profanity_flag = False
+
+         # Detect Abuse
+         for chunk in abuse_chunks:
+             result = self.Abuse_detector(chunk)
+             if result[0]['label'] == 'LABEL_1':  # Assuming LABEL_1 is abusive content
+                 abusive_flag = True
+
+         # Detect Hate Speech
+         for chunk in hate_speech_chunks:
+             result = self.Hate_speech_detector(chunk)
+             if result[0]['label'] == 'HATE':  # Assuming HATE label indicates hate speech
+                 hatespeech_flag = True
+
+         # Detect Profanity
+         for chunk in profanity_chunks:
+             result = self.Profanity_detector(chunk)
+             if result[0]['label'] == 'OFFENSIVE':  # Assuming OFFENSIVE label indicates profanity
+                 profanity_flag = True
+
+         # Return classification results
+         return {
+             "abusive_flag": abusive_flag,
+             "hatespeech_flag": hatespeech_flag,
+             "profanity_flag": profanity_flag
+         }
+
+     def extract_speaker_text(self, transcript, client_label="Client", care_provider_label="Care Provider"):
+         """
+         Extracts text spoken by the client and the care provider from the transcript.
+         """
+         client_text = []
+         care_provider_text = []
+
+         lines = transcript.split("\n")
+         for line in lines:
+             if line.startswith(client_label + ":"):
+                 client_text.append(line[len(client_label) + 1:].strip())
+             elif line.startswith(care_provider_label + ":"):
+                 care_provider_text.append(line[len(care_provider_label) + 1:].strip())
+
+         return " ".join(client_text), " ".join(care_provider_text)
+
+ # Gradio interface for the web app
+ detector = AbuseHateProfanityDetector()
+
+ interface = gr.Interface(
+     fn=detector.classify_text,
+     inputs=[gr.Textbox(label="Enter text")],
+     outputs="json",
+     title="Abuse, Hate Speech, and Profanity Detection",
+     description="Enter text to detect whether it contains abusive, hateful, or offensive content."
+ )
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     interface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ mypy
+ torch
+ gradio
+ langchain
+ transformers
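
Below is a minimal sketch (not part of the commit) of how the uploaded classifier could be exercised locally without the Gradio UI. It assumes the dependencies from requirements.txt are installed and that the snippet sits next to app.py; the sample sentence is purely illustrative, and importing app will download the three Hugging Face models on first run.

# Quick local check of the uploaded classifier (illustrative sketch only).
# Assumes `pip install torch gradio langchain transformers` has been run
# and that this file lives in the same directory as app.py.
from app import detector

sample = "Thank you for your help today."  # placeholder input, not from the commit
flags = detector.classify_text(sample)
print(flags)  # e.g. {'abusive_flag': False, 'hatespeech_flag': False, 'profanity_flag': False}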