import gradio as gr
from transformers import pipeline, AutoTokenizer

# Define the model name
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# Load the tokenizer and model using the pipeline
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)
# Helper function to align entities correctly and debug tokenization
def prepare_entities_for_highlight(text, results):
    entities = []
    seen_spans = set()  # Track the spans we have already added to avoid overlaps

    # Print debug info about tokenization
    print(f"Original text: {text}")

    # Each category maps to a list of entity dicts, e.g.:
    # [{'entity': 'org.ent.pressagency.Reuters', 'score': np.float32(98.47), 'index': 78, 'word': 'Reuters', 'start': 440, 'end': 447}]
    for category, entity_list in results.items():
        for entity in entity_list:
            entity_span = (entity["start"], entity["end"])

            # Only add non-overlapping entities
            if entity_span not in seen_spans:
                seen_spans.add(entity_span)
                entity_text = text[
                    entity["start"] : entity["end"]
                ].strip()  # Ensure we're working with the correct portion of the text
                entity["text"] = entity_text
                # Drop the raw token string; the character span and label are enough
                entity.pop("word", None)
                print(f"Entity text: {entity}")
                entities.append(entity)

    # Sort entities by their start position
    entities = sorted(entities, key=lambda x: x["start"])

    return {"text": text, "entities": entities}
# Function to process the sentence and extract entities
def extract_entities(sentence):
    results = ner_pipeline(sentence)

    # Debugging the result format
    print(f"NER results: {results}")

    # Format the results for HighlightedText
    return prepare_entities_for_highlight(sentence, results)

# Create Gradio interface
def ner_app_interface():
    input_sentence = gr.Textbox(
        lines=5, label="Input Sentence", placeholder="Enter a sentence for NER..."
    )
    output_entities = gr.HighlightedText(label="Extracted Entities")

    # Interface definition
    interface = gr.Interface(
        fn=extract_entities,
        inputs=input_sentence,
        outputs=output_entities,
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        examples=[
            [
                "In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles."
            ]
        ],
        live=False,
    )

    interface.launch(share=True)

# Run the app
if __name__ == "__main__":
    ner_app_interface()