Spaces:

impresso-project
/

multilingual-named-entity-recognition

Running

App Files Files Community

emanuelaboros committed on Oct 18, 2024

Commit

e9c0cd2

1 Parent(s): b3b05b5

let's see how the highlight works

Browse files

Files changed (1) hide show

app.py +13 -18

app.py CHANGED Viewed

@@ -41,13 +41,18 @@ def format_entities_as_html(entities):
     return html_output
-# Helper function to align entities correctly and debug tokenization
-def prepare_entities_for_highlight(text, results):
     entities = []
     seen_spans = set()  # Track the spans we have already added to avoid overlaps
     # Print debug info about tokenization
-    print(f"Original text: {text}")
     print("Results:", results)
     # it should look like:
     # [{'entity': 'org.ent.pressagency.Reuters', 'score': np.float32(98.47), 'index': 78, 'text': 'Reuters', 'start': 440, 'end': 447}]
@@ -57,29 +62,18 @@ def prepare_entities_for_highlight(text, results):
         # Only add non-overlapping entities
         if entity_span not in seen_spans:
             seen_spans.add(entity_span)
-            entity_text = text[
                 entity["start"] : entity["end"]
             ].strip()  # Ensure we're working with the correct portion of the text
             entity["surface"] = entity_text
             print(f"Entity text: {entity}")
             entities.append(entity)
     # Sort entities by their start position
     entities = sorted(entities, key=lambda x: x["start"])
-    return format_entities_as_html(entities)
-# Function to process the sentence and extract entities
-def extract_entities(sentence):
-    results = ner_pipeline(sentence)
-    # Debugging the result format
-    print(f"NER results: {results}")
-    # Format the results for HighlightedText
-    return prepare_entities_for_highlight(sentence, results)
 # Create Gradio interface
@@ -93,7 +87,8 @@ def ner_app_interface():
     interface = gr.Interface(
         fn=extract_entities,
         inputs=input_sentence,
-        outputs=output_entities,
         title="Named Entity Recognition",
         description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
         examples=[

     return html_output
+# Function to process the sentence and extract entities
+def extract_entities(sentence):
+    results = ner_pipeline(sentence)
+    # Debugging the result format
+    print(f"NER results: {results}")
     entities = []
     seen_spans = set()  # Track the spans we have already added to avoid overlaps
     # Print debug info about tokenization
+    print(f"Original text: {sentence}")
     print("Results:", results)
     # it should look like:
     # [{'entity': 'org.ent.pressagency.Reuters', 'score': np.float32(98.47), 'index': 78, 'text': 'Reuters', 'start': 440, 'end': 447}]
         # Only add non-overlapping entities
         if entity_span not in seen_spans:
             seen_spans.add(entity_span)
+            entity_text = sentence[
                 entity["start"] : entity["end"]
             ].strip()  # Ensure we're working with the correct portion of the text
             entity["surface"] = entity_text
+            entity["entity"] = entity["type"]
             print(f"Entity text: {entity}")
             entities.append(entity)
     # Sort entities by their start position
     entities = sorted(entities, key=lambda x: x["start"])
+    return entities
 # Create Gradio interface
     interface = gr.Interface(
         fn=extract_entities,
         inputs=input_sentence,
+        outputs=[gr.HighlightedText(label="Text with  mentions")],
+        # outputs=output_entities,
         title="Named Entity Recognition",
         description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
         examples=[