sajalmadan0909 committed on
Commit
4039061
·
verified ·
1 Parent(s): 098769e

Upload 3 files

Browse files
Files changed (3) hide show
  1. gradio_app.py +567 -0
  2. readme_md.md +67 -0
  3. requirements_txt (2).txt +6 -0
gradio_app.py ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
4
+ import warnings
5
+ warnings.filterwarnings("ignore")
6
+
7
class MultiModelIndianAddressNER:
    """Extract address components with one of several token-classification models.

    Tokenizers and models are loaded lazily through ``load_model`` and cached
    per model key, so each model is only downloaded/initialised once per
    process. ``predict`` is the public entry point used by the UI callbacks.
    """

    # Tokens that are always ignored when grouping SentencePiece output.
    # Callers can extend this set via the ``special_tokens`` argument of
    # ``group_entities_sentencepiece``.
    _DEFAULT_SPECIAL_TOKENS = ("<s>", "</s>", "<pad>", "<unk>")

    def __init__(self):
        """Build the model registry, caches and label map, then load TinyBERT."""
        # Available models configuration
        self.models_config = {
            "TinyBERT": {
                "name": "shiprocket-ai/open-tinybert-indian-address-ner",
                "description": "Lightweight and fast - 66.4M parameters",
                "base_model": "TinyBERT",
            },
            "ModernBERT": {
                "name": "shiprocket-ai/open-modernbert-indian-address-ner",
                "description": "Modern architecture - 599MB model",
                "base_model": "ModernBERT",
            },
            "IndicBERT": {
                "name": "shiprocket-ai/open-indicbert-indian-address-ner",
                "description": "Indic language optimized - 131MB model",
                "base_model": "IndicBERT",
            },
        }

        # Cache for loaded models, keyed by the same keys as models_config
        self.loaded_models = {}
        self.loaded_tokenizers = {}

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Entity mappings (same for all models). Keys are the predicted class
        # index rendered as a string.
        self.id2entity = {
            "0": "O",
            "1": "B-building_name",
            "2": "I-building_name",
            "3": "B-city",
            "4": "I-city",
            "5": "B-country",
            "6": "I-country",
            "7": "B-floor",
            "8": "I-floor",
            "9": "B-house_details",
            "10": "I-house_details",
            "11": "B-locality",
            "12": "I-locality",
            "13": "B-pincode",
            "14": "I-pincode",
            "15": "B-road",
            "16": "I-road",
            "17": "B-state",
            "18": "I-state",
            "19": "B-sub_locality",
            "20": "I-sub_locality",
            "21": "B-landmarks",
            "22": "I-landmarks",
        }

        # Load default model (TinyBERT) eagerly so the first request is fast
        self.load_model("TinyBERT")

    def load_model(self, model_key):
        """Return ``(tokenizer, model)`` for *model_key*, loading it on first use.

        Raises:
            KeyError: if *model_key* is not a key of ``self.models_config``.
            Exception: any error raised while loading is printed and re-raised.
        """
        if model_key not in self.loaded_models:
            print(f"Loading {model_key} model...")
            model_name = self.models_config[model_key]["name"]

            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForTokenClassification.from_pretrained(model_name)
                model.to(self.device)
                model.eval()
            except Exception as e:
                print(f"❌ Error loading {model_key}: {str(e)}")
                raise

            # Only cache once both tokenizer and model loaded successfully
            self.loaded_tokenizers[model_key] = tokenizer
            self.loaded_models[model_key] = model
            print(f"✅ {model_key} model loaded successfully!")

        return self.loaded_tokenizers[model_key], self.loaded_models[model_key]

    def predict(self, address, model_key="TinyBERT"):
        """Extract entities from an Indian address using the specified model.

        Returns:
            A ``(entities, model_info)`` tuple. ``entities`` maps an entity
            type to a list of ``{"text", "confidence"}`` dicts; ``model_info``
            is a human-readable status string. Any failure is reported as
            ``({}, "Error with <model_key>: ...")`` instead of being raised.
        """
        if not address.strip():
            return {}, f"Using {model_key} model"

        try:
            # Load the selected model
            tokenizer, model = self.load_model(model_key)

            # Different approaches based on tokenizer type
            if model_key == "IndicBERT":
                # IndicBERT uses SentencePiece - use token-based approach
                entities = self._predict_token_based(address, tokenizer, model)
            else:
                # TinyBERT and ModernBERT - use offset mapping approach
                entities = self._predict_offset_based(address, tokenizer, model)

            model_info = f"Using {model_key} ({self.models_config[model_key]['description']})"
            return entities, model_info

        except Exception as e:
            return {}, f"Error with {model_key}: {str(e)}"

    @staticmethod
    def _append_entity(entities, entity_type, text, confidence):
        """Append one finished entity to ``entities[entity_type]``."""
        entities.setdefault(entity_type, []).append({
            "text": text,
            "confidence": confidence,
        })

    def _run_inference(self, model, inputs):
        """Run *model* on tokenizer output and return per-token ids and confidences.

        Returns the argmax class ids and the matching softmax probabilities
        for the first (only) sequence in the batch.
        """
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_ids = torch.argmax(predictions, dim=-1)
            confidence_scores = torch.max(predictions, dim=-1)[0]

        return predicted_ids[0], confidence_scores[0]

    def group_entities_sentencepiece(self, tokens, labels, confidences, special_tokens=None):
        """Group BIO-labelled SentencePiece tokens into entities.

        Args:
            tokens: token strings, using "▁" as the word-start marker.
            labels: one BIO label per token ("O", "B-x", "I-x").
            confidences: one confidence value per token.
            special_tokens: optional extra tokens to skip in addition to
                ``<s>``, ``</s>``, ``<pad>`` and ``<unk>`` (for example a
                tokenizer's ``all_special_tokens``).

        Returns:
            Dict mapping entity type to a list of ``{"text", "confidence"}``.
        """
        skipped = set(self._DEFAULT_SPECIAL_TOKENS)
        if special_tokens:
            skipped.update(special_tokens)

        entities = {}
        current_entity = None

        def flush(entity):
            # Clean up the text by removing SentencePiece markers and extra spaces
            self._append_entity(
                entities,
                entity["type"],
                self._clean_sentencepiece_text(entity["text"]),
                entity["confidence"],
            )

        for token, label, conf in zip(tokens, labels, confidences):
            if token in skipped:
                continue

            if label.startswith("B-"):
                # A new entity begins: save the previous one first
                if current_entity:
                    flush(current_entity)
                current_entity = {
                    "type": label[2:],  # Remove "B-"
                    "text": token.replace("▁", " ").strip(),
                    "confidence": conf,
                }

            elif label.startswith("I-") and current_entity:
                # Continue the current entity only when the types agree;
                # a mismatching I- tag is ignored.
                if label[2:] == current_entity["type"]:
                    if token.startswith("▁"):
                        # New word boundary
                        current_entity["text"] += " " + token.replace("▁", "")
                    else:
                        # Subword continuation
                        current_entity["text"] += token
                    # Pairwise average of the running value and this token
                    current_entity["confidence"] = (current_entity["confidence"] + conf) / 2

            elif label == "O" and current_entity:
                # End current entity
                flush(current_entity)
                current_entity = None

        # Add final entity if exists
        if current_entity:
            flush(current_entity)

        return entities

    def _clean_sentencepiece_text(self, text):
        """Clean SentencePiece text by removing markers and fixing spacing."""
        # Remove SentencePiece markers
        clean_text = text.replace("▁", " ")
        # Collapse runs of whitespace
        clean_text = " ".join(clean_text.split())
        # Remove trailing commas and spaces
        clean_text = clean_text.strip().rstrip(",").strip()
        return clean_text

    def _predict_offset_based(self, address, tokenizer, model):
        """Offset-based prediction for TinyBERT and ModernBERT."""
        inputs = tokenizer(
            address,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
            return_offsets_mapping=True,
        )

        # The model does not accept offset_mapping, so take it out first
        offset_mapping = inputs.pop("offset_mapping")[0]
        predicted_ids, confidence_scores = self._run_inference(model, inputs)

        # Extract entities using offset mapping
        return self.extract_entities_with_offsets(
            address,
            predicted_ids,
            confidence_scores,
            offset_mapping,
        )

    def _predict_token_based(self, address, tokenizer, model):
        """Token-based prediction for IndicBERT (SentencePiece)."""
        inputs = tokenizer(
            address,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
        )
        token_ids = inputs["input_ids"][0].tolist()
        predicted_ids, confidence_scores = self._run_inference(model, inputs)

        # Convert to tokens and labels
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        predicted_labels = [
            self.id2entity.get(str(label_id.item()), "O") for label_id in predicted_ids
        ]
        # Plain Python floats, matching what the offset-based path returns
        confidences = confidence_scores.cpu().tolist()

        # Group entities with proper text reconstruction; also skip whatever
        # special tokens this tokenizer declares, if it declares any.
        return self.group_entities_sentencepiece(
            tokens,
            predicted_labels,
            confidences,
            special_tokens=getattr(tokenizer, "all_special_tokens", None),
        )

    def extract_entities_with_offsets(self, original_text, predicted_ids, confidences, offset_mapping):
        """Extract entities using offset mapping for accurate text reconstruction.

        Args:
            original_text: the address string that was tokenized.
            predicted_ids: per-token class ids (elements must support ``.item()``).
            confidences: per-token confidences (elements must support ``.item()``).
            offset_mapping: per-token ``(start, end)`` character offsets into
                *original_text*; ``(0, 0)`` marks a special token.

        Returns:
            Dict mapping entity type to a list of ``{"text", "confidence"}``.
        """
        entities = {}
        current_entity = None

        for i, (pred_id, conf) in enumerate(zip(predicted_ids, confidences)):
            if i >= len(offset_mapping):
                break

            # Plain ints so slicing works the same for tensors, lists and tuples
            start, end = (int(value) for value in offset_mapping[i])

            # Skip special tokens (they have (0,0) mapping)
            if start == end == 0:
                continue

            label = self.id2entity.get(str(pred_id.item()), "O")

            if label.startswith("B-"):
                # Save previous entity
                if current_entity:
                    self._append_entity(
                        entities,
                        current_entity["type"],
                        current_entity["text"],
                        current_entity["confidence"],
                    )

                # Start new entity
                current_entity = {
                    "type": label[2:],  # Remove "B-"
                    "text": original_text[start:end],
                    "confidence": conf.item(),
                    "start": start,
                    "end": end,
                }

            elif label.startswith("I-") and current_entity:
                # Continue the current entity only when the types agree
                if label[2:] == current_entity["type"]:
                    # Re-slice from the entity start so separators between
                    # tokens are taken verbatim from the original text
                    current_entity["text"] = original_text[current_entity["start"]:end]
                    current_entity["confidence"] = (current_entity["confidence"] + conf.item()) / 2
                    current_entity["end"] = end

            elif label == "O" and current_entity:
                # End current entity
                self._append_entity(
                    entities,
                    current_entity["type"],
                    current_entity["text"],
                    current_entity["confidence"],
                )
                current_entity = None

        # Add final entity if exists
        if current_entity:
            self._append_entity(
                entities,
                current_entity["type"],
                current_entity["text"],
                current_entity["confidence"],
            )

        return entities
316
+
317
# Initialize the multi-model system
# NOTE: constructing MultiModelIndianAddressNER loads the default TinyBERT
# model in its __init__, so this runs at import time, before the UI is built.
print("Initializing Multi-Model Indian Address NER...")
ner_system = MultiModelIndianAddressNER()
print("System ready!")
321
+
322
def process_address(address_text, selected_model):
    """Run the selected model on an address and return a Markdown report.

    Args:
        address_text: the address typed by the user; ``None`` or blank input
            yields a prompt message instead of a model call.
        selected_model: model key passed through to ``ner_system.predict``.

    Returns:
        A Markdown string: the entities grouped by type (well-known types
        first, in a fixed order, then any others), or an error/empty message.
        Exceptions raised while predicting or formatting are reported in the
        returned string rather than propagated.
    """
    if not address_text or not address_text.strip():
        return "Please enter an address to analyze."

    def confidence_icon(confidence):
        # Same thresholds as the legend printed at the bottom of the report
        if confidence > 0.8:
            return "🟢"
        if confidence > 0.6:
            return "🟡"
        return "🔴"

    try:
        # Extract entities using selected model
        entities, model_info = ner_system.predict(address_text, selected_model)

        if not entities:
            return f"❌ No entities found in the provided address.\n\n**{model_info}**"

        # Preferred presentation order; anything else follows in dict order
        entity_order = [
            'building_name', 'floor', 'house_details', 'road',
            'sub_locality', 'locality', 'landmarks', 'city',
            'state', 'country', 'pincode'
        ]
        ordered_types = [t for t in entity_order if t in entities]
        ordered_types += [t for t in entities if t not in entity_order]

        parts = [
            f"📍 **Input Address:** {address_text}\n\n",
            f"🤖 **{model_info}**\n\n",
            "🏷️ **Extracted Entities:**\n\n",
        ]

        for entity_type in ordered_types:
            parts.append(f"**{entity_type.replace('_', ' ').title()}:**\n")
            for entity in entities[entity_type]:
                confidence = entity['confidence']
                text = entity['text']
                parts.append(
                    f" {confidence_icon(confidence)} {text} (confidence: {confidence:.3f})\n"
                )
            parts.append("\n")

        parts.append("\n**Legend:**\n")
        parts.append("🟢 High confidence (>0.8)\n")
        parts.append("🟡 Medium confidence (0.6-0.8)\n")
        parts.append("🔴 Low confidence (<0.6)\n")

        return "".join(parts)

    except Exception as e:
        return f"❌ Error processing address: {str(e)}"
380
+
381
def compare_models(address_text):
    """Run every configured model on one address and return a Markdown comparison.

    Args:
        address_text: the address typed by the user; ``None`` or blank input
            yields a prompt message instead of any model call.

    Returns:
        A Markdown string with one section per key of
        ``ner_system.models_config``. A model that raises gets an error
        section of its own; the remaining models are still run.
    """
    if not address_text or not address_text.strip():
        return "Please enter an address to compare models."

    parts = [
        f"📍 **Address:** {address_text}\n\n",
        "🔄 **Model Comparison:**\n\n",
    ]

    for model_key in ner_system.models_config:
        try:
            # The status string is not shown here; only the entities are used
            entities, _ = ner_system.predict(address_text, model_key)
            parts.append(f"### {model_key}\n")
            parts.append(f"*{ner_system.models_config[model_key]['description']}*\n\n")

            if entities:
                entity_count = sum(len(entity_list) for entity_list in entities.values())
                parts.append(f"**Found {entity_count} entities:**\n")

                for entity_type, entity_list in sorted(entities.items()):
                    for entity in entity_list:
                        confidence = entity['confidence']
                        text = entity['text']
                        confidence_icon = "🟢" if confidence > 0.8 else "🟡" if confidence > 0.6 else "🔴"
                        parts.append(f" {confidence_icon} {entity_type}: {text} ({confidence:.3f})\n")
            else:
                parts.append("❌ No entities found\n")

            parts.append("\n---\n\n")

        except Exception as e:
            parts.append(f"### {model_key}\n❌ Error: {str(e)}\n\n---\n\n")

    return "".join(parts)
414
+
415
# Sample addresses for examples
# Each entry becomes a clickable button in both the single-model and the
# comparison tab.
sample_addresses = [
    "Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
    "DLF Cyber City, Sector 25, Gurgaon, Haryana",
    "Flat 201, MG Road, Bangalore, Karnataka, 560001",
    "Phoenix Mall, Kurla West, Mumbai",
    "House No 456, Green Park Extension, New Delhi, 110016",
    "Office 302, Tech Park, Electronic City, Bangalore, Karnataka, 560100"
]

# Create Gradio interface
# Three tabs: single-model analysis, side-by-side comparison, and static
# model information, followed by a footer with links to the models.
with gr.Blocks(title="Multi-Model Indian Address NER", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏠 Multi-Model Indian Address Named Entity Recognition

    Compare different transformer models for extracting components from Indian addresses. Choose between TinyBERT (fast), ModernBERT (modern), and IndicBERT (Indic-optimized).

    **Supported entities:** Building Name, Floor, House Details, Road, Sub-locality, Locality, Landmarks, City, State, Country, Pincode
    """)

    with gr.Tab("Single Model Analysis"):
        with gr.Row():
            with gr.Column(scale=1):
                # Choices come straight from the model registry keys
                model_dropdown = gr.Dropdown(
                    choices=list(ner_system.models_config.keys()),
                    value="TinyBERT",
                    label="Select Model",
                    info="Choose which model to use for entity extraction"
                )

                address_input = gr.Textbox(
                    label="Enter Indian Address",
                    placeholder="e.g., Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
                    lines=3,
                    max_lines=5
                )

                submit_btn = gr.Button("🔍 Extract Entities", variant="primary")

                gr.Markdown("### 📝 Sample Addresses (click to use):")
                sample_buttons = []
                for addr in sample_addresses:
                    btn = gr.Button(addr, size="sm")
                    # x=addr binds the current address as a default argument;
                    # a plain closure would see only the last loop value.
                    btn.click(fn=lambda x=addr: x, outputs=address_input)
                    sample_buttons.append(btn)

            with gr.Column(scale=1):
                output_text = gr.Markdown(
                    label="Extracted Entities",
                    value="Select a model, enter an address, and click 'Extract Entities' to see the results."
                )

        # Event handlers for single model
        # Both the button click and the textbox submit event run the same callback.
        submit_btn.click(
            fn=process_address,
            inputs=[address_input, model_dropdown],
            outputs=output_text
        )

        address_input.submit(
            fn=process_address,
            inputs=[address_input, model_dropdown],
            outputs=output_text
        )

    with gr.Tab("Model Comparison"):
        with gr.Row():
            with gr.Column(scale=1):
                address_compare = gr.Textbox(
                    label="Enter Indian Address for Comparison",
                    placeholder="e.g., Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
                    lines=3,
                    max_lines=5
                )

                compare_btn = gr.Button("🔄 Compare All Models", variant="secondary")

                gr.Markdown("### 📝 Sample Addresses (click to use):")
                sample_buttons_compare = []
                for addr in sample_addresses:
                    btn = gr.Button(addr, size="sm")
                    # Same default-argument binding as in the first tab
                    btn.click(fn=lambda x=addr: x, outputs=address_compare)
                    sample_buttons_compare.append(btn)

            with gr.Column(scale=1):
                comparison_output = gr.Markdown(
                    label="Model Comparison Results",
                    value="Enter an address and click 'Compare All Models' to see how different models perform."
                )

        # Event handlers for comparison
        compare_btn.click(
            fn=compare_models,
            inputs=address_compare,
            outputs=comparison_output
        )

        address_compare.submit(
            fn=compare_models,
            inputs=address_compare,
            outputs=comparison_output
        )

    with gr.Tab("Model Information"):
        # Static reference text; sizes mirror the descriptions in models_config
        gr.Markdown("""
        ## 📊 Available Models

        ### TinyBERT
        - **Base Model**: huawei-noah/TinyBERT_General_6L_768D
        - **Model Size**: ~66.4M parameters
        - **Advantages**: Fastest inference, lowest memory usage, mobile-friendly
        - **Best for**: Real-time applications, edge deployment

        ### ModernBERT
        - **Base Model**: Modern transformer architecture
        - **Model Size**: ~599MB
        - **Advantages**: Latest architectural improvements, balanced performance
        - **Best for**: High-accuracy requirements with reasonable speed

        ### IndicBERT
        - **Base Model**: Indic language optimized transformer
        - **Model Size**: ~131MB
        - **Advantages**: Optimized for Indian languages and contexts
        - **Best for**: Mixed language addresses, regional Indian contexts

        ## 🎯 Entity Types Supported

        All models can extract the following entities:
        - **Building Name**: Apartment/building names
        - **Floor**: Floor numbers and details
        - **House Details**: House/flat numbers
        - **Road**: Street and road names
        - **Sub-locality**: Sector, block details
        - **Locality**: Area, neighborhood names
        - **Landmarks**: Notable nearby locations
        - **City**: City names
        - **State**: State names
        - **Country**: Country names
        - **Pincode**: Postal codes
        """)

    gr.Markdown("""
    ---
    **Models:**
    - [TinyBERT](https://huggingface.co/shiprocket-ai/open-tinybert-indian-address-ner) |
    [ModernBERT](https://huggingface.co/shiprocket-ai/open-modernbert-indian-address-ner) |
    [IndicBERT](https://huggingface.co/shiprocket-ai/open-indicbert-indian-address-ner)

    **About:** These models are specifically trained on Indian address patterns and can handle various formats and styles common in Indian addresses.
    """)

# Start the web server only when the file is executed as a script
if __name__ == "__main__":
    demo.launch()
readme_md.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multi-Model Indian Address NER Demo
2
+
3
+ This is a Gradio-based demo that allows you to compare three different Indian Address NER models:
4
+ - [TinyBERT](https://huggingface.co/shiprocket-ai/open-tinybert-indian-address-ner) - Lightweight and fast
5
+ - [ModernBERT](https://huggingface.co/shiprocket-ai/open-modernbert-indian-address-ner) - Modern architecture
6
+ - [IndicBERT](https://huggingface.co/shiprocket-ai/open-indicbert-indian-address-ner) - Indic language optimized
7
+
8
+ ## What it does
9
+
10
+ This application allows you to:
11
+
12
+ 1. **Single Model Analysis**: Choose one model and extract entities from Indian addresses
13
+ 2. **Model Comparison**: Compare how all three models perform on the same address
14
+ 3. **Interactive Testing**: Use sample addresses or input your own
15
+
16
+ The models can identify:
17
+
18
+ - Building names
19
+ - Floor numbers
20
+ - House details
21
+ - Roads
22
+ - Sub-localities
23
+ - Localities
24
+ - Landmarks
25
+ - Cities
26
+ - States
27
+ - Countries
28
+ - Pincodes
29
+
30
+ ## How to use
31
+
32
+ ### Single Model Analysis
33
+ 1. Select a model from the dropdown (TinyBERT, ModernBERT, or IndicBERT)
34
+ 2. Enter an Indian address in the text box
35
+ 3. Click "Extract Entities" or press Enter
36
+ 4. View the extracted entities with confidence scores
37
+
38
+ ### Model Comparison
39
+ 1. Go to the "Model Comparison" tab
40
+ 2. Enter an address
41
+ 3. Click "Compare All Models"
42
+ 4. See how each model performs on the same input
43
+
44
+ ## Example addresses
45
+
46
+ - Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058
47
+ - DLF Cyber City, Sector 25, Gurgaon, Haryana
48
+ - Flat 201, MG Road, Bangalore, Karnataka, 560001
49
+
50
+ ## Model Information
51
+
52
+ ### TinyBERT
53
+ - **Parameters**: ~66.4M
54
+ - **Advantages**: Fastest inference, lowest memory
55
+ - **Best for**: Real-time applications, mobile deployment
56
+
57
+ ### ModernBERT
58
+ - **Model size**: ~599MB
59
+ - **Advantages**: Modern architecture, balanced performance
60
+ - **Best for**: High accuracy with reasonable speed
61
+
62
+ ### IndicBERT
63
+ - **Model size**: ~131MB
64
+ - **Advantages**: Optimized for Indian languages/contexts
65
+ - **Best for**: Mixed language addresses, regional contexts
66
+
67
+ **Framework**: PyTorch + Transformers
requirements_txt (2).txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=1.9.0
2
+ transformers>=4.21.0
3
+ gradio>=4.0.0
4
+ numpy>=1.21.0
5
+ tokenizers>=0.13.0
6
+ sentencepiece>=0.1.99