Spaces:

shiprocket-ai
/

multi-model-indian-address-ner

Running

App Files Files Community

sajalmadan0909 committed 27 days ago

Commit

8808792

verified ·

1 Parent(s): 8961a48

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -83

app.py CHANGED Viewed

@@ -101,89 +101,7 @@ class MultiModelIndianAddressNER:
                 entities = self._predict_offset_based(address, tokenizer, model)
             model_info = f"Using {model_key} ({self.models_config[model_key]['description']})"
-            return entities
-    def group_entities_sentencepiece(self, tokens, labels, confidences):
-        """Group entities for SentencePiece tokenization (IndicBERT) with proper text reconstruction"""
-        entities = {}
-        current_entity = None
-        for i, (token, label, conf) in enumerate(zip(tokens, labels, confidences)):
-            if token in ["<s>", "</s>", "<pad>", "<unk>"]:
-                continue
-            if label.startswith("B-"):
-                # Save previous entity
-                if current_entity:
-                    entity_type = current_entity["type"]
-                    if entity_type not in entities:
-                        entities[entity_type] = []
-                    # Clean up the text by removing SentencePiece markers and extra spaces
-                    clean_text = self._clean_sentencepiece_text(current_entity["text"])
-                    entities[entity_type].append({
-                        "text": clean_text,
-                        "confidence": current_entity["confidence"]
-                    })
-                # Start new entity - handle SentencePiece format
-                entity_type = label[2:]  # Remove "B-"
-                clean_token = token.replace("▁", " ").strip()
-                current_entity = {
-                    "type": entity_type,
-                    "text": clean_token,
-                    "confidence": conf
-                }
-            elif label.startswith("I-") and current_entity:
-                # Continue current entity
-                entity_type = label[2:]  # Remove "I-"
-                if entity_type == current_entity["type"]:
-                    # Handle SentencePiece subword continuation
-                    if token.startswith("▁"):
-                        # New word boundary
-                        current_entity["text"] += " " + token.replace("▁", "")
-                    else:
-                        # Subword continuation
-                        current_entity["text"] += token
-                    current_entity["confidence"] = (current_entity["confidence"] + conf) / 2
-            elif label == "O" and current_entity:
-                # End current entity
-                entity_type = current_entity["type"]
-                if entity_type not in entities:
-                    entities[entity_type] = []
-                clean_text = self._clean_sentencepiece_text(current_entity["text"])
-                entities[entity_type].append({
-                    "text": clean_text,
-                    "confidence": current_entity["confidence"]
-                })
-                current_entity = None
-        # Add final entity if exists
-        if current_entity:
-            entity_type = current_entity["type"]
-            if entity_type not in entities:
-                entities[entity_type] = []
-            clean_text = self._clean_sentencepiece_text(current_entity["text"])
-            entities[entity_type].append({
-                "text": clean_text,
-                "confidence": current_entity["confidence"]
-            })
-        return entities
-    def _clean_sentencepiece_text(self, text):
-        """Clean SentencePiece text by removing markers and fixing spacing"""
-        # Remove SentencePiece markers
-        clean_text = text.replace("▁", " ")
-        # Remove extra spaces and clean up
-        clean_text = " ".join(clean_text.split())
-        # Remove trailing commas and spaces
-        clean_text = clean_text.strip().rstrip(",").strip()
-        return clean_text, model_info
         except Exception as e:
             return {}, f"Error with {model_key}: {str(e)}"
@@ -313,6 +231,88 @@ class MultiModelIndianAddressNER:
             })
         return entities
 # Initialize the multi-model system
 print("Initializing Multi-Model Indian Address NER...")

                 entities = self._predict_offset_based(address, tokenizer, model)
             model_info = f"Using {model_key} ({self.models_config[model_key]['description']})"
+            return entities, model_info
         except Exception as e:
             return {}, f"Error with {model_key}: {str(e)}"
             })
         return entities
+    def group_entities_sentencepiece(self, tokens, labels, confidences):
+        """Group entities for SentencePiece tokenization (IndicBERT) with proper text reconstruction"""
+        entities = {}
+        current_entity = None
+        for i, (token, label, conf) in enumerate(zip(tokens, labels, confidences)):
+            if token in ["<s>", "</s>", "<pad>", "<unk>"]:
+                continue
+            if label.startswith("B-"):
+                # Save previous entity
+                if current_entity:
+                    entity_type = current_entity["type"]
+                    if entity_type not in entities:
+                        entities[entity_type] = []
+                    # Clean up the text by removing SentencePiece markers and extra spaces
+                    clean_text = self._clean_sentencepiece_text(current_entity["text"])
+                    entities[entity_type].append({
+                        "text": clean_text,
+                        "confidence": current_entity["confidence"]
+                    })
+                # Start new entity - handle SentencePiece format
+                entity_type = label[2:]  # Remove "B-"
+                clean_token = token.replace("▁", " ").strip()
+                current_entity = {
+                    "type": entity_type,
+                    "text": clean_token,
+                    "confidence": conf
+                }
+            elif label.startswith("I-") and current_entity:
+                # Continue current entity
+                entity_type = label[2:]  # Remove "I-"
+                if entity_type == current_entity["type"]:
+                    # Handle SentencePiece subword continuation
+                    if token.startswith("▁"):
+                        # New word boundary
+                        current_entity["text"] += " " + token.replace("▁", "")
+                    else:
+                        # Subword continuation
+                        current_entity["text"] += token
+                    current_entity["confidence"] = (current_entity["confidence"] + conf) / 2
+            elif label == "O" and current_entity:
+                # End current entity
+                entity_type = current_entity["type"]
+                if entity_type not in entities:
+                    entities[entity_type] = []
+                clean_text = self._clean_sentencepiece_text(current_entity["text"])
+                entities[entity_type].append({
+                    "text": clean_text,
+                    "confidence": current_entity["confidence"]
+                })
+                current_entity = None
+        # Add final entity if exists
+        if current_entity:
+            entity_type = current_entity["type"]
+            if entity_type not in entities:
+                entities[entity_type] = []
+            clean_text = self._clean_sentencepiece_text(current_entity["text"])
+            entities[entity_type].append({
+                "text": clean_text,
+                "confidence": current_entity["confidence"]
+            })
+        return entities
+    def _clean_sentencepiece_text(self, text):
+        """Clean SentencePiece text by removing markers and fixing spacing"""
+        # Remove SentencePiece markers
+        clean_text = text.replace("▁", " ")
+        # Remove extra spaces and clean up
+        clean_text = " ".join(clean_text.split())
+        # Remove trailing commas and spaces
+        clean_text = clean_text.strip().rstrip(",").strip()
+        return clean_text
 # Initialize the multi-model system
 print("Initializing Multi-Model Indian Address NER...")