sajalmadan0909 committed
Commit 8808792 · verified · 1 Parent(s): 8961a48

Update app.py

Files changed (1)
  1. app.py +83 -83
app.py CHANGED
@@ -101,89 +101,7 @@ class MultiModelIndianAddressNER:
             entities = self._predict_offset_based(address, tokenizer, model)

             model_info = f"Using {model_key} ({self.models_config[model_key]['description']})"
-            return entities
-
-    def group_entities_sentencepiece(self, tokens, labels, confidences):
-        """Group entities for SentencePiece tokenization (IndicBERT) with proper text reconstruction"""
-        entities = {}
-        current_entity = None
-
-        for i, (token, label, conf) in enumerate(zip(tokens, labels, confidences)):
-            if token in ["<s>", "</s>", "<pad>", "<unk>"]:
-                continue
-
-            if label.startswith("B-"):
-                # Save previous entity
-                if current_entity:
-                    entity_type = current_entity["type"]
-                    if entity_type not in entities:
-                        entities[entity_type] = []
-
-                    # Clean up the text by removing SentencePiece markers and extra spaces
-                    clean_text = self._clean_sentencepiece_text(current_entity["text"])
-                    entities[entity_type].append({
-                        "text": clean_text,
-                        "confidence": current_entity["confidence"]
-                    })
-
-                # Start new entity - handle SentencePiece format
-                entity_type = label[2:]  # Remove "B-"
-                clean_token = token.replace("▁", " ").strip()
-                current_entity = {
-                    "type": entity_type,
-                    "text": clean_token,
-                    "confidence": conf
-                }
-
-            elif label.startswith("I-") and current_entity:
-                # Continue current entity
-                entity_type = label[2:]  # Remove "I-"
-                if entity_type == current_entity["type"]:
-                    # Handle SentencePiece subword continuation
-                    if token.startswith("▁"):
-                        # New word boundary
-                        current_entity["text"] += " " + token.replace("▁", "")
-                    else:
-                        # Subword continuation
-                        current_entity["text"] += token
-                    current_entity["confidence"] = (current_entity["confidence"] + conf) / 2
-
-            elif label == "O" and current_entity:
-                # End current entity
-                entity_type = current_entity["type"]
-                if entity_type not in entities:
-                    entities[entity_type] = []
-
-                clean_text = self._clean_sentencepiece_text(current_entity["text"])
-                entities[entity_type].append({
-                    "text": clean_text,
-                    "confidence": current_entity["confidence"]
-                })
-                current_entity = None
-
-        # Add final entity if exists
-        if current_entity:
-            entity_type = current_entity["type"]
-            if entity_type not in entities:
-                entities[entity_type] = []
-
-            clean_text = self._clean_sentencepiece_text(current_entity["text"])
-            entities[entity_type].append({
-                "text": clean_text,
-                "confidence": current_entity["confidence"]
-            })
-
-        return entities
-
-    def _clean_sentencepiece_text(self, text):
-        """Clean SentencePiece text by removing markers and fixing spacing"""
-        # Remove SentencePiece markers
-        clean_text = text.replace("▁", " ")
-        # Remove extra spaces and clean up
-        clean_text = " ".join(clean_text.split())
-        # Remove trailing commas and spaces
-        clean_text = clean_text.strip().rstrip(",").strip()
-        return clean_text, model_info
+            return entities, model_info

         except Exception as e:
             return {}, f"Error with {model_key}: {str(e)}"
@@ -313,6 +231,88 @@ class MultiModelIndianAddressNER:
            })

        return entities
+
+    def group_entities_sentencepiece(self, tokens, labels, confidences):
+        """Group entities for SentencePiece tokenization (IndicBERT) with proper text reconstruction"""
+        entities = {}
+        current_entity = None
+
+        for i, (token, label, conf) in enumerate(zip(tokens, labels, confidences)):
+            if token in ["<s>", "</s>", "<pad>", "<unk>"]:
+                continue
+
+            if label.startswith("B-"):
+                # Save previous entity
+                if current_entity:
+                    entity_type = current_entity["type"]
+                    if entity_type not in entities:
+                        entities[entity_type] = []
+
+                    # Clean up the text by removing SentencePiece markers and extra spaces
+                    clean_text = self._clean_sentencepiece_text(current_entity["text"])
+                    entities[entity_type].append({
+                        "text": clean_text,
+                        "confidence": current_entity["confidence"]
+                    })
+
+                # Start new entity - handle SentencePiece format
+                entity_type = label[2:]  # Remove "B-"
+                clean_token = token.replace("▁", " ").strip()
+                current_entity = {
+                    "type": entity_type,
+                    "text": clean_token,
+                    "confidence": conf
+                }
+
+            elif label.startswith("I-") and current_entity:
+                # Continue current entity
+                entity_type = label[2:]  # Remove "I-"
+                if entity_type == current_entity["type"]:
+                    # Handle SentencePiece subword continuation
+                    if token.startswith("▁"):
+                        # New word boundary
+                        current_entity["text"] += " " + token.replace("▁", "")
+                    else:
+                        # Subword continuation
+                        current_entity["text"] += token
+                    current_entity["confidence"] = (current_entity["confidence"] + conf) / 2
+
+            elif label == "O" and current_entity:
+                # End current entity
+                entity_type = current_entity["type"]
+                if entity_type not in entities:
+                    entities[entity_type] = []
+
+                clean_text = self._clean_sentencepiece_text(current_entity["text"])
+                entities[entity_type].append({
+                    "text": clean_text,
+                    "confidence": current_entity["confidence"]
+                })
+                current_entity = None
+
+        # Add final entity if exists
+        if current_entity:
+            entity_type = current_entity["type"]
+            if entity_type not in entities:
+                entities[entity_type] = []
+
+            clean_text = self._clean_sentencepiece_text(current_entity["text"])
+            entities[entity_type].append({
+                "text": clean_text,
+                "confidence": current_entity["confidence"]
+            })
+
+        return entities
+
+    def _clean_sentencepiece_text(self, text):
+        """Clean SentencePiece text by removing markers and fixing spacing"""
+        # Remove SentencePiece markers
+        clean_text = text.replace("▁", " ")
+        # Remove extra spaces and clean up
+        clean_text = " ".join(clean_text.split())
+        # Remove trailing commas and spaces
+        clean_text = clean_text.strip().rstrip(",").strip()
+        return clean_text

 # Initialize the multi-model system
 print("Initializing Multi-Model Indian Address NER...")
 