Spaces:

impresso-project
/

multilingual-named-entity-recognition

Running

App Files Community

multilingual-named-entity-recognition / app.py

emanuelaboros

update index

c2d3af5 9 months ago

raw

history blame contribute delete

5.17 kB

	import gradio as gr
	from transformers import pipeline, AutoTokenizer

	# Hub identifier of the NER checkpoint served by this app.
	MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

	# The tokenizer is created explicitly so the pipeline below reuses this
	# exact instance instead of resolving its own.
	ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	# NOTE(review): trust_remote_code=True lets code from the model repository
	# run locally; presumably required for the "generic-ner" task - confirm,
	# and consider pinning a revision.
	ner_pipeline = pipeline(
	    task="generic-ner",
	    model=MODEL_NAME,
	    tokenizer=ner_tokenizer,
	    trust_remote_code=True,
	    device="cpu",
	)


	def format_entities_as_html(entities):
	    """Render a sequence of entity dictionaries as an HTML fragment.

	    Every entity becomes its own ``<div>``; each of its fields is shown as
	    ``<strong>Key:</strong> value<br>``. The ``start``, ``end`` and ``index``
	    fields are skipped. ``float`` values are printed with two decimals.

	    Keys and non-float values are HTML-escaped, because entity fields can
	    carry text taken from user input and must not be interpreted as markup.

	    Args:
	        entities: Iterable of dicts with string keys.

	    Returns:
	        A string holding the whole fragment, wrapped in an outer ``<div>``.
	    """
	    # Function-scope import: the block stays self-contained.
	    from html import escape

	    excluded_keys = {"start", "end", "index"}

	    # Collect fragments and join once instead of growing a string with +=.
	    parts = ["<div>"]
	    for entity in entities:
	        parts.append("<div style='margin-bottom: 10px;'>")
	        for key, value in entity.items():
	            if key in excluded_keys:
	                continue
	            label = escape(key.capitalize())
	            if isinstance(value, float):
	                # A formatted float contains no markup characters.
	                shown = f"{value:.2f}"
	            else:
	                shown = escape(str(value))
	            parts.append(f"<strong>{label}:</strong> {shown}<br>")
	        parts.append("</div>")
	    parts.append("</div>")
	    return "".join(parts)


	def extract_entities(sentence):
	    """Run the NER pipeline on *sentence* and shape the result for display.

	    Each item returned by ``ner_pipeline`` is read through its ``lOffset``,
	    ``rOffset`` and ``type`` keys, plus the optional ``title``, ``name`` and
	    ``function`` keys. Items are updated in place with ``start``, ``end``,
	    ``surface`` (the stripped slice of the input) and ``entity`` (a label
	    built from the type and the optional fields).

	    Args:
	        sentence: Text to analyse.

	    Returns:
	        A dict with ``"text"`` (the input) and ``"entities"`` (the kept
	        items, in the order the pipeline produced them, one per distinct
	        ``(start, end)`` span).
	    """
	    # Blank input has nothing to tag, so the model call is skipped.
	    if not sentence or not sentence.strip():
	        return {"text": sentence or "", "entities": []}

	    results = ner_pipeline(sentence)

	    # Debug output, printed once per request.
	    print(f"Original text: {sentence}")
	    print(f"NER results: {results}")

	    entities = []
	    seen_spans = set()
	    optional_parts = (("title", "Title"), ("name", "Name"), ("function", "Function"))

	    for entity in results:
	        entity["start"] = entity["lOffset"]
	        entity["end"] = entity["rOffset"]
	        span = (entity["start"], entity["end"])

	        # Only an exact repeat of an earlier span is dropped; spans that
	        # merely overlap are all kept.
	        if span in seen_spans:
	            continue
	        seen_spans.add(span)

	        entity["surface"] = sentence[entity["start"] : entity["end"]].strip()

	        label = str(entity["type"])
	        for key, caption in optional_parts:
	            if key in entity:
	                label += f" - {caption}: {entity[key]}"
	        entity["entity"] = label

	        entities.append(entity)

	    print(f"Entities: {entities}")
	    return {"text": sentence, "entities": entities}


	def ner_app_interface():
	    """Build the Gradio demo for the NER model and launch it.

	    The textbox content is passed to ``extract_entities`` and the value it
	    returns is shown in a ``gr.HighlightedText`` component. Nothing is
	    returned; the function ends by calling ``launch`` on the interface.
	    """
	    input_sentence = gr.Textbox(
	        lines=5, label="Input Sentence", placeholder="Enter a sentence for NER:"
	    )

	    example_sentences = (
	        "Des chercheurs de l'Université de Cambridge ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul.",
	        "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. (Reuters)",
	        "In the year 1789, the Estates-General was convened in France.",
	        "The event was held at the Palace of Versailles, a symbol of French monarchy.",
	        "At Versailles, Marie Antoinette, the Queen of France, was involved in discussions.",
	        "Maximilien Robespierre, a leading member of the National Assembly, also participated.",
	        "Jean-Jacques Rousseau, the famous philosopher, was a significant figure in the debate.",
	        "Another important participant was Charles de Talleyrand, the Bishop of Autun.",
	        "Meanwhile, across the Atlantic, George Washington, the first President of the United States, was shaping policies.",
	        "Thomas Jefferson, the nation's Secretary of State, played a key role in drafting policies for the new American government.",
	    )

	    interface = gr.Interface(
	        fn=extract_entities,
	        inputs=input_sentence,
	        outputs=[gr.HighlightedText(label="Text with mentions")],
	        title="Named Entity Recognition",
	        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
	        # One inner list per example: the interface has a single input.
	        examples=[[text] for text in example_sentences],
	        live=False,
	    )

	    # NOTE(review): share=True requests a public share link from Gradio -
	    # confirm this is wanted where the app is deployed.
	    interface.launch(share=True)


	# Start the demo only when this file is executed as a script.
	if __name__ == "__main__":
	    ner_app_interface()