File size: 5,169 Bytes
d5d2a07 c619232 d5d2a07 5436b2b d5d2a07 9df3a2f 563c6f6 7e788d5 9df3a2f 7e788d5 9df3a2f 7e788d5 9df3a2f 7e788d5 9df3a2f e9c0cd2 49dd9a6 53e96e8 dd1b5ba e9c0cd2 bf5bc24 5b5c00d 69a7750 09dc6c7 c2d3af5 d6100e0 e9c0cd2 d6100e0 b3b05b5 bf5f745 69a7750 8726584 d6100e0 8726584 53e96e8 8726584 0300379 d5d2a07 69a7750 d5d2a07 5436b2b a836d61 5436b2b 9df3a2f d5d2a07 e9c0cd2 d5d2a07 5436b2b 464c568 a836d61 464c568 ac886a9 d5d2a07 5436b2b ac886a9 d5d2a07 5436b2b d5d2a07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import html
import numbers

import gradio as gr
from transformers import pipeline, AutoTokenizer
# Hub identifier of the NER checkpoint used by the demo.
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# The tokenizer is created explicitly so the very same instance is handed to
# the pipeline below.
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): trust_remote_code=True lets transformers run Python code that
# ships with the model repository; "generic-ner" is presumably a custom task
# defined there -- confirm before pointing MODEL_NAME at another checkpoint.
ner_pipeline = pipeline(
    task="generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    device="cpu",
    trust_remote_code=True,
)
def format_entities_as_html(entities):
    """Render NER entities as an HTML fragment, one ``<div>`` per entity.

    Every field of an entity is emitted as ``<strong>Key:</strong> value<br>``
    except the positional keys ``start``, ``end`` and ``index``. Keys and
    values are HTML-escaped because they may carry text that originates from
    user input and must not be interpreted as markup.

    Args:
        entities: Iterable of dicts (string keys) describing one entity each.

    Returns:
        The HTML markup as a string; ``"<div></div>"`` when ``entities`` is
        empty.
    """
    excluded_keys = {"start", "end", "index"}
    parts = ["<div>"]
    for entity in entities:
        parts.append("<div style='margin-bottom: 10px;'>")
        for key, value in entity.items():
            if key in excluded_keys:
                continue
            # Treat any non-rational real number as a score. A plain
            # isinstance(value, float) check misses numpy.float32, which is
            # not a subclass of float, while ints (offsets, counts) must stay
            # unformatted.
            is_score = isinstance(value, numbers.Real) and not isinstance(
                value, numbers.Rational
            )
            if is_score:
                rendered = f"{value:.2f}"
            else:
                rendered = html.escape(str(value))
            label = html.escape(key.capitalize())
            parts.append(f"<strong>{label}:</strong> {rendered}<br>")
        parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)
# Function to process the sentence and extract entities
def extract_entities(sentence):
    """Run the NER pipeline on ``sentence`` and shape the result for display.

    Each entity returned by ``ner_pipeline`` must provide the keys
    ``lOffset``, ``rOffset`` and ``type``; ``title``, ``name`` and
    ``function`` are optional and, when present, are appended to the label.
    Kept entity dicts are updated in place with ``start``, ``end``,
    ``surface`` (the stripped slice of ``sentence``) and ``entity`` (the
    display label).

    Entities are kept in pipeline order; an entity whose span duplicates or
    overlaps a span that was already kept is dropped, so the returned spans
    are pairwise disjoint.

    Args:
        sentence: The text to analyse.

    Returns:
        A dict ``{"text": sentence, "entities": [...]}``. Blank input yields
        an empty entity list without calling the model.
    """
    if not sentence or not sentence.strip():
        return {"text": sentence or "", "entities": []}

    results = ner_pipeline(sentence)
    print(f"NER results: {results}")
    print(f"Original text: {sentence}")

    entities = []
    kept_spans = []  # (start, end) of every entity accepted so far
    for entity in results:
        start, end = entity["lOffset"], entity["rOffset"]

        # Half-open interval test; the equality check additionally catches
        # duplicated zero-length spans, which the interval test alone misses.
        clashes = any(
            (start, end) == (kept_start, kept_end)
            or (start < kept_end and kept_start < end)
            for kept_start, kept_end in kept_spans
        )
        if clashes:
            continue
        kept_spans.append((start, end))

        entity["start"] = start
        entity["end"] = end
        entity["surface"] = sentence[start:end].strip()

        label = f"{entity['type']}"
        if "title" in entity:
            label += f" - Title: {entity['title']}"
        if "name" in entity:
            label += f" - Name: {entity['name']}"
        if "function" in entity:
            label += f" - Function: {entity['function']}"
        entity["entity"] = label

        entities.append(entity)

    print(f"Entities: {entities}")
    return {"text": sentence, "entities": entities}
# Create Gradio interface
def ner_app_interface(share=True):
    """Build the Gradio demo around ``extract_entities`` and launch it.

    The interface has one multi-line text box as input and one highlighted
    text component as output, plus a list of clickable example sentences.

    Args:
        share: Passed through to ``Interface.launch``. Defaults to ``True``,
            which is the value that used to be hard-coded here.
    """
    input_sentence = gr.Textbox(
        lines=5, label="Input Sentence", placeholder="Enter a sentence for NER:"
    )

    # One single-element list per example because the interface has one input.
    examples = [
        [
            "Des chercheurs de l'Université de Cambridge ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
        ],
        [
            "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. (Reuters)"
        ],
        ["In the year 1789, the Estates-General was convened in France."],
        [
            "The event was held at the Palace of Versailles, a symbol of French monarchy."
        ],
        [
            "At Versailles, Marie Antoinette, the Queen of France, was involved in discussions."
        ],
        [
            "Maximilien Robespierre, a leading member of the National Assembly, also participated."
        ],
        [
            "Jean-Jacques Rousseau, the famous philosopher, was a significant figure in the debate."
        ],
        [
            "Another important participant was Charles de Talleyrand, the Bishop of Autun."
        ],
        [
            "Meanwhile, across the Atlantic, George Washington, the first President of the United States, was shaping policies."
        ],
        [
            "Thomas Jefferson, the nation's Secretary of State, played a key role in drafting policies for the new American government."
        ],
    ]

    interface = gr.Interface(
        fn=extract_entities,
        inputs=input_sentence,
        outputs=[gr.HighlightedText(label="Text with mentions")],
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        examples=examples,
        live=False,
    )
    interface.launch(share=share)
# Start the demo only when this file is executed as a script, so importing
# the module does not launch a server.
if __name__ == "__main__":
    ner_app_interface()
|