# File size: 11,189 Bytes
# c1410ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277

import torch 
import gradio as gr
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch.nn as nn
from torch.serialization import add_safe_globals

class HierarchicalLabelEncoder:
    """Two-level label vocabulary for cyber crime complaints.

    Holds the fixed list of categories, the subcategories allowed under each
    category, and index lookup tables (label -> index and index -> label) for
    both levels. Subcategory indices are assigned over the de-duplicated
    subcategory names in Python's default sorted string order.
    """

    def __init__(self):
        # Category order defines the category indices.
        self.categories = [
            "Any Other Cyber Crime",
            "Crime Against Women & Children",
            "Cryptocurrency Crime",
            "Cyber Attack/ Dependent Crimes",
            "Cyber Terrorism",
            "Hacking Damage to computercomputer system etc",
            "Online Cyber Trafficking",
            "Online Financial Fraud",
            "Online Gambling Betting",
            "Online and Social Media Related Crime",
            "Ransomware",
            "Report Unlawful Content",
        ]

        # Category -> subcategories that are valid under it.
        self.subcategories_map = {
            "Any Other Cyber Crime": ["Other"],
            "Crime Against Women & Children": [
                "Computer Generated CSAM/CSEM",
                "Cyber Blackmailing & Threatening",
                "Sexual Harassment",
            ],
            "Cryptocurrency Crime": ["Cryptocurrency Fraud"],
            "Cyber Attack/ Dependent Crimes": [
                "Data Breach/Theft",
                "Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks",
                "Hacking/Defacement",
                "Malware Attack",
                "Ransomware Attack",
                "SQL Injection",
                "Tampering with computer source documents",
            ],
            "Cyber Terrorism": ["Cyber Terrorism"],
            "Hacking Damage to computercomputer system etc": [
                "Damage to computer computer systems etc",
                "Email Hacking",
                "Tampering with computer source documents",
                "Unauthorised AccessData Breach",
                "Website DefacementHacking",
            ],
            "Online Cyber Trafficking": ["Online Trafficking"],
            "Online Financial Fraud": [
                "Business Email CompromiseEmail Takeover",
                "DebitCredit Card FraudSim Swap Fraud",
                "DematDepository Fraud",
                "EWallet Related Fraud",
                "Fraud CallVishing",
                "Internet Banking Related Fraud",
                "UPI Related Frauds",
            ],
            "Online Gambling Betting": ["Online Gambling Betting"],
            "Online and Social Media Related Crime": [
                "Cheating by Impersonation",
                "Cyber Bullying Stalking Sexting",
                "EMail Phishing",
                "FakeImpersonating Profile",
                "Impersonating Email",
                "Intimidating Email",
                "Online Job Fraud",
                "Online Matrimonial Fraud",
                "Profile Hacking Identity Theft",
                "Provocative Speech for unlawful acts",
            ],
            "Ransomware": ["Ransomware"],
            "Report Unlawful Content": ["Against Interest of sovereignty or integrity of India"],
        }

        self.category_to_idx, self.idx_to_category = self._index_tables(self.categories)

        # A subcategory name can appear under more than one category, so the
        # flat vocabulary is de-duplicated before sorting.
        unique_subcategories = {
            name
            for names in self.subcategories_map.values()
            for name in names
        }
        self.subcategories = sorted(unique_subcategories)

        self.subcategory_to_idx, self.idx_to_subcategory = self._index_tables(self.subcategories)

    @staticmethod
    def _index_tables(labels):
        """Return ``(label -> position, position -> label)`` dicts for ``labels``."""
        forward = dict(zip(labels, range(len(labels))))
        backward = dict(enumerate(labels))
        return forward, backward

    def encode(self, category, subcategory):
        """Return ``(category_idx, subcategory_idx)`` for a valid label pair.

        Raises:
            ValueError: If ``category`` is unknown or ``subcategory`` is not
                listed under ``category``.
        """
        allowed = self.subcategories_map.get(category)
        if allowed is None or subcategory not in allowed:
            raise ValueError(f"Invalid category-subcategory pair: {category} - {subcategory}")
        return (self.category_to_idx[category], self.subcategory_to_idx[subcategory])

    def decode(self, category_idx, subcategory_idx):
        """Return ``(category, subcategory)`` names for the given indices.

        Raises:
            KeyError: If either index is not present in its lookup table.
        """
        category = self.idx_to_category[category_idx]
        subcategory = self.idx_to_subcategory[subcategory_idx]
        return (category, subcategory)

class HinglishClassifier(nn.Module):
    """DistilBERT sequence classifier with an additional subcategory head.

    The wrapped ``DistilBertForSequenceClassification`` supplies the category
    logits. A small feed-forward head, applied to the first-position vector of
    the last hidden layer, supplies the subcategory logits.
    """

    def __init__(self, num_categories, num_subcategories):
        """Build the base model and the subcategory head.

        Args:
            num_categories: Number of labels for the base classifier.
            num_subcategories: Output width of the subcategory head.
        """
        super().__init__()
        self.base_model = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=num_categories
        )
        # Layer order is kept fixed so the Sequential indices (and therefore
        # the state_dict keys) stay the same.
        # NOTE(review): 768 is presumably the base model's hidden width —
        # confirm if the pretrained checkpoint is ever changed.
        head_layers = [
            nn.Linear(768, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_subcategories),
        ]
        self.subcategory_classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return ``(category_logits, subcategory_logits)`` for a batch."""
        encoder_out = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        # Last entry of hidden_states, sliced at position 0 of dimension 1.
        final_layer = encoder_out.hidden_states[-1]
        first_position = final_layer[:, 0]
        subcategory_logits = self.subcategory_classifier(first_position)
        return encoder_out.logits, subcategory_logits

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Module-level placeholders; load_model_components() fills them in.
label_encoder = None
model = None
tokenizer = None

def load_model_components():
    """Load the label encoder, classifier and tokenizer into module globals.

    Does nothing when ``model`` is already set. Otherwise reads the checkpoint
    ``final_model.pt``, builds a ``HinglishClassifier`` sized from a fresh
    ``HierarchicalLabelEncoder``, restores ``checkpoint['model_state_dict']``,
    moves the model to ``device``, switches it to eval mode and loads the
    tokenizer from the ``tokenizer`` path.

    The globals are published only after every step has succeeded. A failed
    load therefore never leaves ``model`` set while ``tokenizer`` is still
    ``None``, and the next call retries the whole load.

    Raises:
        Exception: Any error raised while loading is printed and re-raised.
    """
    global label_encoder, model, tokenizer

    if model is not None:
        return

    try:
        # SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary
        # objects, so 'final_model.pt' must come from a trusted source.
        # Left unchanged here because the checkpoint contents are not visible
        # from this file.
        checkpoint = torch.load('final_model.pt', map_location=device, weights_only=False)

        loaded_encoder = HierarchicalLabelEncoder()
        loaded_model = HinglishClassifier(
            num_categories=len(loaded_encoder.categories),
            num_subcategories=len(loaded_encoder.subcategories)
        )
        loaded_model.load_state_dict(checkpoint['model_state_dict'])
        loaded_model.to(device)
        loaded_model.eval()

        loaded_tokenizer = DistilBertTokenizer.from_pretrained('tokenizer')
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

    # Publish `model` last: callers use `model is None` as the "not loaded"
    # check, so the other two must already be in place when it becomes set.
    label_encoder = loaded_encoder
    tokenizer = loaded_tokenizer
    model = loaded_model

def predict_category(text):
    """Classify a complaint and return its top-3 categories and subcategories.

    Loads the model components on first use, tokenizes ``text`` (truncated
    and padded to 128 tokens), runs the classifier without gradient tracking
    and turns both sets of logits into softmax probabilities.

    Args:
        text: Complaint text to classify.

    Returns:
        A two-element list ``[categories_data, subcategories_data]``. Each
        element is a list of ``[label, confidence]`` rows taken from
        ``torch.topk``, with the confidence formatted as a percentage string
        with two decimals.
    """
    if model is None:
        load_model_components()

    temperature = 1.0
    k = 3

    def top_rows(logits, idx_to_label):
        # Softmax over the class dimension, then keep the k best entries of
        # the first (only) row in the batch.
        probabilities = torch.softmax(logits/temperature, dim=1)
        best = torch.topk(probabilities, k)
        label_ids = best.indices[0].tolist()
        scores = best.values[0].tolist()
        return [
            [idx_to_label[label_id], f"{score:.2%}"]
            for label_id, score in zip(label_ids, scores)
        ]

    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        category_logits, subcategory_logits = model(
            encoded['input_ids'],
            encoded['attention_mask'],
        )

        # Rows are shaped for the two DataFrame outputs of the interface.
        categories_data = top_rows(category_logits, label_encoder.idx_to_category)
        subcategories_data = top_rows(subcategory_logits, label_encoder.idx_to_subcategory)

    return [categories_data, subcategories_data]

# Input widget: a tall free-text box for the complaint.
_complaint_input = gr.Textbox(
    lines=15,
    label="Enter your complaint text",
    placeholder="Type your cyber crime complaint here...",
    scale=6,
    show_copy_button=True,
)

# Output widgets: one read-only table per prediction level.
_category_table = gr.DataFrame(
    headers=["Categories", "Confidence"],
    row_count=3,
    col_count=2,
    interactive=False,
    label="Top 3 Categories",
)
_subcategory_table = gr.DataFrame(
    headers=["Subcategories", "Confidence"],
    row_count=3,
    col_count=2,
    interactive=False,
    label="Top 3 Subcategories",
)

# Gradio app wiring predict_category to the widgets above, with example
# complaints and custom CSS.
iface = gr.Interface(
    fn=predict_category,
    inputs=[_complaint_input],
    outputs=[_category_table, _subcategory_table],
    title="Cyber Crime Complaint Classifier",
    description="This system classifies cyber crime complaints into categories and subcategories. Enter your complaint in either Hindi/English/Hinglish text to get started.",
    examples=[
        ["Someone hacked my email account and sent spam to all my contacts."],
        ["I lost money through a UPI fraud transaction. Mera UPI froud reverse karva dijiye. Paytm customer care number se complaint bhi kiya hai."],
        ["Someone created fake Instagram profile with my daughter's photos and sending vulgar messages"],
        ["Ek ladke ne meri beti ko social media pe harass kiya aur uske private photos leak karne ki dhamki di"],
        ["Mere bank account se UPI fraud ke through 50,000 rupay nikal liye"],
        ["Koi mere naam se fake Facebook profile bana ke mere dosto ko paise ke liye message kar raha hai"],
        ["Kisi ne mere laptop ko hack karke usme se personal photos access kar liye hain. Ab wo photos viral karne ki dhamki dekar paise ki demand kar rahe hain. Ye bahut sensitive personal data breach hai"],
        ["Dating app pe match hone ke baad video call pe compromising situation record kar li aur ab blackmail kar rahe hain. Bahut badi amount ki demand kar rahe hain warna video viral karne ki dhamki de rahe hain."],
        ["Mere computer pe ransomware attack hua hai, sara data encrypt ho gaya hai"],
        ["Online cricket betting app pe paise lagaye, ab withdrawal nahi kar pa raha"],
        ["Maine cryptocurrency trading bot ke subscription ke paise pay kiye. Bot ne automatic trading karke sara balance loss kar diya aur ab company ka koi response nahi hai."],
        ["Maine online cricket betting app pe significant amount invest kiya tha. Initially small amounts ki withdrawal hui lekin ab bade amount ki withdrawal request process nahi ho rahi. App operators ka koi response nahi aa raha hai."],
    ],
    css="""
    .gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
    }
    h1 {
        text-align: center !important;
        font-size: 42px !important;
        font-weight: bold !important;
        margin-bottom: 16px !important;
    }
    p {
        text-align: left !important;
        font-size: 18px !important;
        margin-bottom: 24px !important;
    }
    /* Add textbox styling */
    .gradio-textbox textarea {
        font-size: 48px !important;
        line-height: 1.5 !important;
    }
    .primary-category label {
        font-size: 14px !important;
        font-weight: 600 !important;
    }
    .examples-container {
        background: #f7f7f7 !important;
        padding: 20px !important;
        border-radius: 8px !important;
        margin-top: 24px !important;
        border: 1px solid #ddd !important;
    }
    .examples-container h3 {
        font-size: 16px !important;
        margin-bottom: 12px !important;
    }
    .submit-row {
        display: flex !important;
        gap: 12px !important;
        margin-top: 16px !important;
    }
    .submit-btn, .clear-btn {
        flex: 1 !important;
        padding: 8px 16px !important;
    }
    """,
)
if __name__ == "__main__":
    # Script entry point: runs only when this file is executed directly.
    # Allowlist HierarchicalLabelEncoder with torch's safe-globals registry.
    # NOTE(review): load_model_components() calls torch.load with
    # weights_only=False; this allowlist presumably only matters for
    # weights_only=True loads — confirm whether it is still needed.
    add_safe_globals([HierarchicalLabelEncoder])
    # Start the Gradio app. share=True is passed, which presumably requests a
    # public share link — confirm that this exposure is intended.
    iface.launch(share=True)