Upload 3 files
Browse files- gradio_app.py +567 -0
- readme_md.md +67 -0
- requirements_txt (2).txt +6 -0
gradio_app.py
ADDED
@@ -0,0 +1,567 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
4 |
+
import warnings
|
5 |
+
warnings.filterwarnings("ignore")
|
6 |
+
|
7 |
+
class MultiModelIndianAddressNER:
    """Extract address components with one of several token-classification models.

    Models and tokenizers are loaded lazily by key (see ``models_config``) and
    cached, so switching between models only pays the load cost once.  All
    models share the same BIO label set (``id2entity``).
    """

    # Tokens that must never become part of an entity's text.  The original
    # list only covered the "<s>"-style markers; "[CLS]"/"[SEP]"/"[MASK]" are
    # added so BERT/ALBERT-style tokenizers are handled as well.
    _SPECIAL_TOKENS = frozenset(
        {"<s>", "</s>", "<pad>", "<unk>", "[CLS]", "[SEP]", "[MASK]"}
    )

    def __init__(self):
        """Set up model metadata, caches and the label map, then load TinyBERT."""
        # Available models configuration
        self.models_config = {
            "TinyBERT": {
                "name": "shiprocket-ai/open-tinybert-indian-address-ner",
                "description": "Lightweight and fast - 66.4M parameters",
                "base_model": "TinyBERT"
            },
            "ModernBERT": {
                "name": "shiprocket-ai/open-modernbert-indian-address-ner",
                "description": "Modern architecture - 599MB model",
                "base_model": "ModernBERT"
            },
            "IndicBERT": {
                "name": "shiprocket-ai/open-indicbert-indian-address-ner",
                "description": "Indic language optimized - 131MB model",
                "base_model": "IndicBERT"
            }
        }

        # Cache for loaded models, keyed by the same keys as models_config
        self.loaded_models = {}
        self.loaded_tokenizers = {}

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Entity mappings (same for all models); keys are stringified label ids
        self.id2entity = {
            "0": "O",
            "1": "B-building_name",
            "2": "I-building_name",
            "3": "B-city",
            "4": "I-city",
            "5": "B-country",
            "6": "I-country",
            "7": "B-floor",
            "8": "I-floor",
            "9": "B-house_details",
            "10": "I-house_details",
            "11": "B-locality",
            "12": "I-locality",
            "13": "B-pincode",
            "14": "I-pincode",
            "15": "B-road",
            "16": "I-road",
            "17": "B-state",
            "18": "I-state",
            "19": "B-sub_locality",
            "20": "I-sub_locality",
            "21": "B-landmarks",
            "22": "I-landmarks"
        }

        # Load default model (TinyBERT)
        self.load_model("TinyBERT")

    def load_model(self, model_key):
        """Return ``(tokenizer, model)`` for *model_key*, loading them on first use.

        Raises:
            KeyError: if *model_key* is not a key of ``models_config``.
            Exception: whatever ``from_pretrained`` raises; it is logged and
                re-raised unchanged.
        """
        if model_key not in self.loaded_models:
            print(f"Loading {model_key} model...")
            model_name = self.models_config[model_key]["name"]

            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForTokenClassification.from_pretrained(model_name)
                model.to(self.device)
                model.eval()
            except Exception as e:
                print(f"❌ Error loading {model_key}: {str(e)}")
                # Bare raise keeps the original traceback intact.
                raise

            self.loaded_tokenizers[model_key] = tokenizer
            self.loaded_models[model_key] = model
            print(f"✅ {model_key} model loaded successfully!")

        return self.loaded_tokenizers[model_key], self.loaded_models[model_key]

    def predict(self, address, model_key="TinyBERT"):
        """Extract entities from an Indian address using the specified model.

        Returns:
            A ``(entities, model_info)`` tuple.  ``entities`` maps an entity
            type (e.g. ``"city"``) to a list of ``{"text", "confidence"}``
            dicts; ``model_info`` is a human-readable status line.  On empty
            input or on any failure ``entities`` is ``{}`` and ``model_info``
            explains why; this method does not raise.
        """
        if not address or not address.strip():
            return {}, f"Using {model_key} model"

        try:
            # Load the selected model
            tokenizer, model = self.load_model(model_key)

            # Different approaches based on tokenizer type
            if model_key == "IndicBERT":
                # IndicBERT uses SentencePiece - use token-based approach
                entities = self._predict_token_based(address, tokenizer, model)
            else:
                # TinyBERT and ModernBERT - use offset mapping approach
                entities = self._predict_offset_based(address, tokenizer, model)

            model_info = f"Using {model_key} ({self.models_config[model_key]['description']})"
            return entities, model_info

        except Exception as e:
            # UI boundary: report the failure as text instead of crashing the app.
            return {}, f"Error with {model_key}: {str(e)}"

    @staticmethod
    def _as_list(values):
        """Return *values* as a plain list (tensors/arrays via ``tolist()``)."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    @staticmethod
    def _store_entity(entities, entity, text):
        """Append a finished entity to *entities* with its mean token confidence."""
        entities.setdefault(entity["type"], []).append({
            "text": text,
            "confidence": entity["conf_sum"] / entity["conf_count"]
        })

    def group_entities_sentencepiece(self, tokens, labels, confidences):
        """Group BIO-labelled SentencePiece tokens into entities.

        Args:
            tokens: token strings, where ``"▁"`` marks the start of a word.
            labels: BIO labels aligned with *tokens*.
            confidences: per-token confidence values aligned with *tokens*.

        Returns:
            Dict mapping entity type to a list of ``{"text", "confidence"}``
            dicts.  ``confidence`` is the arithmetic mean over the entity's
            tokens.
        """
        entities = {}
        current = None

        for token, label, conf in zip(tokens, labels, confidences):
            if token in self._SPECIAL_TOKENS:
                continue

            if label.startswith("B-"):
                # Save previous entity before starting a new one
                if current:
                    self._store_entity(
                        entities, current,
                        self._clean_sentencepiece_text(current["text"])
                    )
                current = {
                    "type": label[2:],  # Remove "B-"
                    "text": token.replace("▁", " ").strip(),
                    "conf_sum": float(conf),
                    "conf_count": 1
                }

            elif label.startswith("I-") and current:
                # Continue current entity only when the types agree; a
                # mismatched I- token is ignored, as before.
                if label[2:] == current["type"]:
                    if token.startswith("▁"):
                        # New word boundary
                        current["text"] += " " + token.replace("▁", "")
                    else:
                        # Subword continuation
                        current["text"] += token
                    current["conf_sum"] += float(conf)
                    current["conf_count"] += 1

            elif label == "O" and current:
                # End current entity
                self._store_entity(
                    entities, current,
                    self._clean_sentencepiece_text(current["text"])
                )
                current = None

        # Add final entity if exists
        if current:
            self._store_entity(
                entities, current,
                self._clean_sentencepiece_text(current["text"])
            )

        return entities

    def _clean_sentencepiece_text(self, text):
        """Clean SentencePiece text by removing markers and fixing spacing."""
        # Remove SentencePiece markers
        clean_text = text.replace("▁", " ")
        # Collapse runs of whitespace
        clean_text = " ".join(clean_text.split())
        # Remove trailing commas and spaces
        clean_text = clean_text.strip().rstrip(",").strip()
        return clean_text

    def _run_inference(self, model, inputs):
        """Run *model* on tokenizer output and return ``(label_ids, confidences)``.

        Both results are 1-D tensors for the first (only) sequence in the
        batch; each confidence is the softmax probability of the chosen label.
        """
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            confidence_scores, predicted_ids = torch.max(probabilities, dim=-1)
        return predicted_ids[0], confidence_scores[0]

    def _predict_offset_based(self, address, tokenizer, model):
        """Offset-based prediction for TinyBERT and ModernBERT."""
        inputs = tokenizer(
            address,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
            return_offsets_mapping=True
        )

        # The offset mapping is not a model input, so remove it before the
        # remaining tensors are moved to the device and passed to the model.
        offset_mapping = inputs.pop("offset_mapping")[0]
        predicted_ids, confidence_scores = self._run_inference(model, inputs)

        # Extract entities using offset mapping
        return self.extract_entities_with_offsets(
            address,
            predicted_ids,
            confidence_scores,
            offset_mapping
        )

    def _predict_token_based(self, address, tokenizer, model):
        """Token-based prediction for IndicBERT (SentencePiece)."""
        inputs = tokenizer(
            address,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        # Read the ids before the tensors are moved to the device.
        input_ids = inputs["input_ids"][0].tolist()
        predicted_ids, confidence_scores = self._run_inference(model, inputs)

        # Convert to tokens and labels
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        predicted_labels = [
            self.id2entity.get(str(label_id), "O")
            for label_id in predicted_ids.tolist()
        ]
        confidences = confidence_scores.cpu().numpy()

        # Group entities with proper text reconstruction
        return self.group_entities_sentencepiece(tokens, predicted_labels, confidences)

    def extract_entities_with_offsets(self, original_text, predicted_ids, confidences, offset_mapping):
        """Extract entities by slicing *original_text* with token offsets.

        Args:
            original_text: the address string that was tokenized.
            predicted_ids: per-token label ids (tensor or sequence).
            confidences: per-token confidences (tensor or sequence).
            offset_mapping: per-token ``(start, end)`` character offsets;
                ``(0, 0)`` entries are treated as special tokens and skipped.

        Returns:
            Dict mapping entity type to a list of ``{"text", "confidence"}``
            dicts.  ``confidence`` is the arithmetic mean over the entity's
            tokens.
        """
        entities = {}
        current = None

        # zip() stops at the shortest input, which also covers the case of
        # fewer offsets than predictions.
        rows = zip(
            self._as_list(predicted_ids),
            self._as_list(confidences),
            self._as_list(offset_mapping)
        )
        for pred_id, conf, (start, end) in rows:
            start, end = int(start), int(end)

            # Skip special tokens (they have (0,0) mapping)
            if start == end == 0:
                continue

            label = self.id2entity.get(str(int(pred_id)), "O")

            if label.startswith("B-"):
                # Save previous entity
                if current:
                    self._store_entity(entities, current, current["text"])

                # Start new entity
                current = {
                    "type": label[2:],  # Remove "B-"
                    "text": original_text[start:end],
                    "conf_sum": float(conf),
                    "conf_count": 1,
                    "start": start,
                    "end": end
                }

            elif label.startswith("I-") and current:
                # Continue current entity only when the types agree
                if label[2:] == current["type"]:
                    # Re-slice from the entity start so separators between
                    # tokens (spaces, punctuation) are kept verbatim.
                    current["text"] = original_text[current["start"]:end]
                    current["conf_sum"] += float(conf)
                    current["conf_count"] += 1
                    current["end"] = end

            elif label == "O" and current:
                # End current entity
                self._store_entity(entities, current, current["text"])
                current = None

        # Add final entity if exists
        if current:
            self._store_entity(entities, current, current["text"])

        return entities
|
316 |
+
|
317 |
+
# Initialize the multi-model system.
# This runs at import time: constructing MultiModelIndianAddressNER loads the
# default TinyBERT model, so importing this module triggers a model load.
print("Initializing Multi-Model Indian Address NER...")
# Shared instance used by process_address() and compare_models() below.
ner_system = MultiModelIndianAddressNER()
print("System ready!")
|
321 |
+
|
322 |
+
def process_address(address_text, selected_model):
    """Analyze one address with the chosen model and return a Markdown report.

    Blank input yields a prompt message.  Otherwise the report lists the
    extracted entities grouped by type (known types first, in a fixed order,
    then any others), each with a colour-coded confidence marker, followed by
    a legend.  Any exception is turned into an error message string.
    """
    if address_text.strip() == "":
        return "Please enter an address to analyze."

    def render_section(entity_type, items):
        # One heading, one line per entity, then a blank separator line.
        lines = [f"**{entity_type.replace('_', ' ').title()}:**\n"]
        for item in items:
            score = item['confidence']
            value = item['text']
            if score > 0.8:
                icon = "🟢"
            elif score > 0.6:
                icon = "🟡"
            else:
                icon = "🔴"
            lines.append(f" {icon} {value} (confidence: {score:.3f})\n")
        lines.append("\n")
        return "".join(lines)

    try:
        # Extract entities using selected model
        entities, model_info = ner_system.predict(address_text, selected_model)

        if not entities:
            return f"❌ No entities found in the provided address.\n\n**{model_info}**"

        # Presentation order for the known entity types
        preferred_order = (
            'building_name', 'floor', 'house_details', 'road',
            'sub_locality', 'locality', 'landmarks', 'city',
            'state', 'country', 'pincode',
        )

        pieces = [
            f"📍 **Input Address:** {address_text}\n\n",
            f"🤖 **{model_info}**\n\n",
            "🏷️ **Extracted Entities:**\n\n",
        ]

        # Known types first, in presentation order
        shown = [name for name in preferred_order if name in entities]
        for name in shown:
            pieces.append(render_section(name, entities[name]))

        # Then anything the model produced that is not in the preferred list
        for name, items in entities.items():
            if name in shown:
                continue
            pieces.append(render_section(name, items))

        pieces.append("\n**Legend:**\n")
        pieces.append("🟢 High confidence (>0.8)\n")
        pieces.append("🟡 Medium confidence (0.6-0.8)\n")
        pieces.append("🔴 Low confidence (<0.6)\n")

        return "".join(pieces)

    except Exception as e:
        return f"❌ Error processing address: {str(e)}"
|
380 |
+
|
381 |
+
def compare_models(address_text):
    """Run every configured model on one address and return a Markdown comparison.

    Blank input yields a prompt message.  Each model gets its own section with
    its description and either the entities it found (sorted by type) or a
    "no entities" note; a failure in one model is reported in that model's
    section and does not stop the others.
    """
    if address_text.strip() == "":
        return "Please enter an address to compare models."

    pieces = [
        f"📍 **Address:** {address_text}\n\n",
        "🔄 **Model Comparison:**\n\n",
    ]

    for model_key in ner_system.models_config:
        try:
            entities, model_info = ner_system.predict(address_text, model_key)
            pieces.append(f"### {model_key}\n")
            pieces.append(f"*{ner_system.models_config[model_key]['description']}*\n\n")

            if not entities:
                pieces.append("❌ No entities found\n")
            else:
                total = sum(len(found) for found in entities.values())
                pieces.append(f"**Found {total} entities:**\n")

                for entity_type, found in sorted(entities.items()):
                    for item in found:
                        score = item['confidence']
                        value = item['text']
                        if score > 0.8:
                            icon = "🟢"
                        elif score > 0.6:
                            icon = "🟡"
                        else:
                            icon = "🔴"
                        pieces.append(f" {icon} {entity_type}: {value} ({score:.3f})\n")

            pieces.append("\n---\n\n")

        except Exception as e:
            # Anything already appended for this model stays in the report.
            pieces.append(f"### {model_key}\n❌ Error: {str(e)}\n\n---\n\n")

    return "".join(pieces)
|
414 |
+
|
415 |
+
# Sample addresses for examples.
# Each entry becomes a clickable button in both UI tabs that fills the
# corresponding address textbox.
sample_addresses = [
    "Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
    "DLF Cyber City, Sector 25, Gurgaon, Haryana",
    "Flat 201, MG Road, Bangalore, Karnataka, 560001",
    "Phoenix Mall, Kurla West, Mumbai",
    "House No 456, Green Park Extension, New Delhi, 110016",
    "Office 302, Tech Park, Electronic City, Bangalore, Karnataka, 560100"
]
|
424 |
+
|
425 |
+
# Create Gradio interface.
# Three tabs: single-model analysis, side-by-side model comparison, and a
# static model information page, followed by a footer with model links.
# NOTE(review): the original indentation of the Markdown text was lost in the
# diff view this was recovered from; gr.Markdown is presumed to dedent its
# input - confirm the headings still render as headings.
with gr.Blocks(title="Multi-Model Indian Address NER", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏠 Multi-Model Indian Address Named Entity Recognition

    Compare different transformer models for extracting components from Indian addresses. Choose between TinyBERT (fast), ModernBERT (modern), and IndicBERT (Indic-optimized).

    **Supported entities:** Building Name, Floor, House Details, Road, Sub-locality, Locality, Landmarks, City, State, Country, Pincode
    """)

    with gr.Tab("Single Model Analysis"):
        with gr.Row():
            # Left column: model choice, address input and sample buttons
            with gr.Column(scale=1):
                model_dropdown = gr.Dropdown(
                    choices=list(ner_system.models_config.keys()),
                    value="TinyBERT",
                    label="Select Model",
                    info="Choose which model to use for entity extraction"
                )

                address_input = gr.Textbox(
                    label="Enter Indian Address",
                    placeholder="e.g., Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
                    lines=3,
                    max_lines=5
                )

                submit_btn = gr.Button("🔍 Extract Entities", variant="primary")

                gr.Markdown("### 📝 Sample Addresses (click to use):")
                sample_buttons = []
                for addr in sample_addresses:
                    btn = gr.Button(addr, size="sm")
                    # x=addr binds the current address at definition time;
                    # a plain closure would see only the last loop value.
                    btn.click(fn=lambda x=addr: x, outputs=address_input)
                    sample_buttons.append(btn)

            # Right column: rendered extraction result
            with gr.Column(scale=1):
                output_text = gr.Markdown(
                    label="Extracted Entities",
                    value="Select a model, enter an address, and click 'Extract Entities' to see the results."
                )

        # Event handlers for single model: the button click and pressing
        # Enter in the textbox both run process_address.
        submit_btn.click(
            fn=process_address,
            inputs=[address_input, model_dropdown],
            outputs=output_text
        )

        address_input.submit(
            fn=process_address,
            inputs=[address_input, model_dropdown],
            outputs=output_text
        )

    with gr.Tab("Model Comparison"):
        with gr.Row():
            # Left column: address input and sample buttons
            with gr.Column(scale=1):
                address_compare = gr.Textbox(
                    label="Enter Indian Address for Comparison",
                    placeholder="e.g., Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
                    lines=3,
                    max_lines=5
                )

                compare_btn = gr.Button("🔄 Compare All Models", variant="secondary")

                gr.Markdown("### 📝 Sample Addresses (click to use):")
                sample_buttons_compare = []
                for addr in sample_addresses:
                    btn = gr.Button(addr, size="sm")
                    # x=addr binds the current address at definition time.
                    btn.click(fn=lambda x=addr: x, outputs=address_compare)
                    sample_buttons_compare.append(btn)

            # Right column: rendered comparison result
            with gr.Column(scale=1):
                comparison_output = gr.Markdown(
                    label="Model Comparison Results",
                    value="Enter an address and click 'Compare All Models' to see how different models perform."
                )

        # Event handlers for comparison: the button click and pressing Enter
        # in the textbox both run compare_models.
        compare_btn.click(
            fn=compare_models,
            inputs=address_compare,
            outputs=comparison_output
        )

        address_compare.submit(
            fn=compare_models,
            inputs=address_compare,
            outputs=comparison_output
        )

    with gr.Tab("Model Information"):
        gr.Markdown("""
        ## 📊 Available Models

        ### TinyBERT
        - **Base Model**: huawei-noah/TinyBERT_General_6L_768D
        - **Model Size**: ~66.4M parameters
        - **Advantages**: Fastest inference, lowest memory usage, mobile-friendly
        - **Best for**: Real-time applications, edge deployment

        ### ModernBERT
        - **Base Model**: Modern transformer architecture
        - **Model Size**: ~599MB
        - **Advantages**: Latest architectural improvements, balanced performance
        - **Best for**: High-accuracy requirements with reasonable speed

        ### IndicBERT
        - **Base Model**: Indic language optimized transformer
        - **Model Size**: ~131MB
        - **Advantages**: Optimized for Indian languages and contexts
        - **Best for**: Mixed language addresses, regional Indian contexts

        ## 🎯 Entity Types Supported

        All models can extract the following entities:
        - **Building Name**: Apartment/building names
        - **Floor**: Floor numbers and details
        - **House Details**: House/flat numbers
        - **Road**: Street and road names
        - **Sub-locality**: Sector, block details
        - **Locality**: Area, neighborhood names
        - **Landmarks**: Notable nearby locations
        - **City**: City names
        - **State**: State names
        - **Country**: Country names
        - **Pincode**: Postal codes
        """)

    # Footer shown below all tabs
    gr.Markdown("""
    ---
    **Models:**
    - [TinyBERT](https://huggingface.co/shiprocket-ai/open-tinybert-indian-address-ner) |
    [ModernBERT](https://huggingface.co/shiprocket-ai/open-modernbert-indian-address-ner) |
    [IndicBERT](https://huggingface.co/shiprocket-ai/open-indicbert-indian-address-ner)

    **About:** These models are specifically trained on Indian address patterns and can handle various formats and styles common in Indian addresses.
    """)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
readme_md.md
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Multi-Model Indian Address NER Demo
|
2 |
+
|
3 |
+
This is a Gradio-based demo that allows you to compare three different Indian Address NER models:
|
4 |
+
- [TinyBERT](https://huggingface.co/shiprocket-ai/open-tinybert-indian-address-ner) - Lightweight and fast
|
5 |
+
- [ModernBERT](https://huggingface.co/shiprocket-ai/open-modernbert-indian-address-ner) - Modern architecture
|
6 |
+
- [IndicBERT](https://huggingface.co/shiprocket-ai/open-indicbert-indian-address-ner) - Indic language optimized
|
7 |
+
|
8 |
+
## What it does
|
9 |
+
|
10 |
+
This application allows you to:
|
11 |
+
|
12 |
+
1. **Single Model Analysis**: Choose one model and extract entities from Indian addresses
|
13 |
+
2. **Model Comparison**: Compare how all three models perform on the same address
|
14 |
+
3. **Interactive Testing**: Use sample addresses or input your own
|
15 |
+
|
16 |
+
The models can identify:
|
17 |
+
|
18 |
+
- Building names
|
19 |
+
- Floor numbers
|
20 |
+
- House details
|
21 |
+
- Roads
|
22 |
+
- Sub-localities
|
23 |
+
- Localities
|
24 |
+
- Landmarks
|
25 |
+
- Cities
|
26 |
+
- States
|
27 |
+
- Countries
|
28 |
+
- Pincodes
|
29 |
+
|
30 |
+
## How to use
|
31 |
+
|
32 |
+
### Single Model Analysis
|
33 |
+
1. Select a model from the dropdown (TinyBERT, ModernBERT, or IndicBERT)
|
34 |
+
2. Enter an Indian address in the text box
|
35 |
+
3. Click "Extract Entities" or press Enter
|
36 |
+
4. View the extracted entities with confidence scores
|
37 |
+
|
38 |
+
### Model Comparison
|
39 |
+
1. Go to the "Model Comparison" tab
|
40 |
+
2. Enter an address
|
41 |
+
3. Click "Compare All Models"
|
42 |
+
4. See how each model performs on the same input
|
43 |
+
|
44 |
+
## Example addresses
|
45 |
+
|
46 |
+
- Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058
|
47 |
+
- DLF Cyber City, Sector 25, Gurgaon, Haryana
|
48 |
+
- Flat 201, MG Road, Bangalore, Karnataka, 560001
|
49 |
+
|
50 |
+
## Model Information
|
51 |
+
|
52 |
+
### TinyBERT
|
53 |
+
- **Parameters**: ~66.4M
|
54 |
+
- **Advantages**: Fastest inference, lowest memory
|
55 |
+
- **Best for**: Real-time applications, mobile deployment
|
56 |
+
|
57 |
+
### ModernBERT
|
58 |
+
- **Model size**: ~599MB
|
59 |
+
- **Advantages**: Modern architecture, balanced performance
|
60 |
+
- **Best for**: High accuracy with reasonable speed
|
61 |
+
|
62 |
+
### IndicBERT
|
63 |
+
- **Model size**: ~131MB
|
64 |
+
- **Advantages**: Optimized for Indian languages/contexts
|
65 |
+
- **Best for**: Mixed language addresses, regional contexts
|
66 |
+
|
67 |
+
**Framework**: PyTorch + Transformers
|
requirements_txt (2).txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch>=1.9.0
|
2 |
+
transformers>=4.21.0
|
3 |
+
gradio>=4.0.0
|
4 |
+
numpy>=1.21.0
|
5 |
+
tokenizers>=0.13.0
|
6 |
+
sentencepiece>=0.1.99
|