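# Page header captured with this file (Hugging Face file viewer):
# author: emanuelaboros | file: app | revision: 5b5c00d | size: 3.15 kB
# (viewer links: raw, history, blame)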
import gradio as gr
from transformers import pipeline, AutoTokenizer
# Hugging Face Hub identifier of the NER model served by this app.
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"
# Load the tokenizer once at import time and hand it to the pipeline below,
# so every request reuses the same tokenizer and model objects.
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline(
    # NOTE(review): "generic-ner" is presumably a custom task shipped with the
    # model repository, which would be why trust_remote_code=True is needed --
    # confirm against the model card.
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    # Inference is pinned to the CPU.
    device="cpu",
)
# Helper function to align entities correctly and debug tokenization
def prepare_entities_for_highlight(text, results):
    """Convert raw NER output into the payload shown by ``gr.HighlightedText``.

    Args:
        text: The sentence that was sent to the NER pipeline.
        results: Either a mapping of category name to a list of entity dicts,
            or a flat list of entity dicts. Every entity dict must provide
            ``"start"``, ``"end"``, ``"score"`` and ``"entity"``, e.g.
            ``{'entity': 'org.ent.pressagency.Reuters', 'score': 98.47,
            'index': 78, 'word': 'Reuters', 'start': 440, 'end': 447}``.

    Returns:
        ``{"text": text, "entities": [...]}`` where the entities are sorted by
        start offset and never overlap. Each entity carries ``"text"``,
        ``"score"``, ``"start"``, ``"end"``, ``"label"`` and ``"entity"``
        (the last two hold the same value).

    Raises:
        KeyError: If an entity dict lacks one of the required keys.
    """
    # Print debug info about tokenization
    print(f"Original text: {text}")

    # The pipeline output is either grouped per category or already flat;
    # normalise both shapes to "an iterable of entity lists".
    if isinstance(results, dict):
        entity_groups = results.values()
    else:
        entity_groups = [results]

    entities = []
    kept_spans = []  # (start, end) of every entity accepted so far
    for entity_list in entity_groups:
        for entity in entity_list:
            start, end = entity["start"], entity["end"]

            # First come, first served: skip exact duplicates as well as any
            # span that intersects an accepted one. Overlapping spans cannot
            # be rendered as a flat sequence of highlighted segments.
            if any(
                (start, end) == (kept_start, kept_end)
                or (start < kept_end and kept_start < end)
                for kept_start, kept_end in kept_spans
            ):
                continue
            kept_spans.append((start, end))

            # Slice the surface form from the original text rather than
            # trusting the tokenizer's (possibly normalised) "word".
            entity_text = text[start:end].strip()
            print(
                f"Entity text: {entity_text}, Start: {start}, End: {end}, Type: {entity['entity']}"
            )
            entities.append(
                {
                    "text": entity_text,
                    "score": entity["score"],
                    "start": start,
                    "end": end,
                    "label": entity["entity"],
                    # HighlightedText looks up the category under "entity"
                    # (or "entity_group"); "label" alone is ignored by it.
                    "entity": entity["entity"],
                }
            )

    # Sort entities by their start position
    entities.sort(key=lambda item: item["start"])
    return {"text": text, "entities": entities}
# Function to process the sentence and extract entities
def extract_entities(sentence):
    """Run the NER pipeline on ``sentence`` and format the result for display.

    The raw pipeline output is printed for debugging, then passed together
    with the sentence to ``prepare_entities_for_highlight``, whose return
    value is returned unchanged.
    """
    ner_output = ner_pipeline(sentence)
    # Dump the raw result so its structure can be inspected in the logs.
    print(f"NER results: {ner_output}")
    highlighted = prepare_entities_for_highlight(sentence, ner_output)
    return highlighted
# Create Gradio interface
def ner_app_interface():
    """Build the Gradio demo around ``extract_entities`` and launch it.

    The UI is a five-line text box feeding ``extract_entities`` and a
    ``HighlightedText`` component that displays the returned entities.
    The app is launched with ``share=True``. Returns ``None``.
    """
    demo = gr.Interface(
        fn=extract_entities,
        inputs=gr.Textbox(
            lines=5,
            label="Input Sentence",
            placeholder="Enter a sentence for NER...",
        ),
        outputs=gr.HighlightedText(label="Extracted Entities"),
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        examples=[
            [
                "In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles."
            ]
        ],
        # Only run the model when the user submits, not on every keystroke.
        live=False,
    )
    demo.launch(share=True)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    ner_app_interface()