Spaces:

impresso-project
/

multilingual-named-entity-recognition

Running

emanuelaboros committed on Oct 17, 2024

Commit

dd1b5ba

1 Parent(s): 53e96e8

update app

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,11 +16,14 @@ ner_pipeline = pipeline(
 )
-# Helper function to flatten entities and prepare them for HighlightedText
 def prepare_entities_for_highlight(text, results):
     entities = []
     seen_spans = set()  # Track the spans we have already added to avoid overlaps
     for category, entity_list in results.items():
         for entity in entity_list:
             entity_span = (entity["start"], entity["end"])
@@ -28,6 +31,12 @@ def prepare_entities_for_highlight(text, results):
             # Only add non-overlapping entities
             if entity_span not in seen_spans:
                 seen_spans.add(entity_span)
                 entities.append(
                     {
                         "start": entity["start"],
@@ -46,6 +55,9 @@ def prepare_entities_for_highlight(text, results):
 def extract_entities(sentence):
     results = ner_pipeline(sentence)
     # Format the results for HighlightedText
     return prepare_entities_for_highlight(sentence, results)

 )
+# Helper function to align entities correctly and debug tokenization
 def prepare_entities_for_highlight(text, results):
     entities = []
     seen_spans = set()  # Track the spans we have already added to avoid overlaps
+    # Print debug info about tokenization
+    print(f"Original text: {text}")
     for category, entity_list in results.items():
         for entity in entity_list:
             entity_span = (entity["start"], entity["end"])
             # Only add non-overlapping entities
             if entity_span not in seen_spans:
                 seen_spans.add(entity_span)
+                entity_text = text[
+                    entity["start"] : entity["end"]
+                ].strip()  # Ensure we're working with the correct portion of the text
+                print(
+                    f"Entity text: {entity_text}, Start: {entity['start']}, End: {entity['end']}, Type: {entity['entity']}"
+                )
                 entities.append(
                     {
                         "start": entity["start"],
 def extract_entities(sentence):
     results = ner_pipeline(sentence)
+    # Debugging the result format
+    print(f"NER results: {results}")
     # Format the results for HighlightedText
     return prepare_entities_for_highlight(sentence, results)