# Source: Hugging Face Space "impresso-project/multilingual-named-entity-recognition".
# The file-viewer residue that preceded the code (status badge, file size and
# per-line commit hashes) is not part of the program.

import gradio as gr
from transformers import pipeline, AutoTokenizer

# Hugging Face Hub identifier of the checkpoint served by this app
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# The tokenizer is instantiated on its own so it can be handed to the pipeline
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Keyword options for the pipeline factory, gathered in one place.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally; presumably that is where the "generic-ner" task
# is implemented — confirm before changing it.
_NER_PIPELINE_OPTIONS = {
    "model": MODEL_NAME,
    "tokenizer": ner_tokenizer,
    "trust_remote_code": True,
    "device": "cpu",
}
ner_pipeline = pipeline("generic-ner", **_NER_PIPELINE_OPTIONS)


def format_entities_as_html(entities):
    """Render entity dictionaries as an HTML fragment, one ``<div>`` per entity.

    Every key/value pair of an entity is emitted as
    ``<strong>Key:</strong> value<br>``, except the positional keys ``start``,
    ``end`` and ``index``, which are left out. Values that are Python
    ``float`` instances are shown with two decimals; everything else is
    rendered with ``str()``.

    Keys and values are HTML-escaped because entity text originates from
    user-supplied input and must not be able to inject markup.

    Args:
        entities: Iterable of dicts whose keys are strings.

    Returns:
        The markup as one string; ``"<div></div>"`` when there are no entities.
    """
    import html  # local import keeps the function usable on its own

    excluded_keys = {"start", "end", "index"}  # Keys to exclude from the output

    # Collect fragments and join once instead of repeated string concatenation
    fragments = ["<div>"]
    for entity in entities:
        # Each entity gets its own container div
        fragments.append("<div style='margin-bottom: 10px;'>")
        for key, value in entity.items():
            if key in excluded_keys:
                continue
            # Two-decimal formatting applies to plain floats only
            shown = f"{value:.2f}" if isinstance(value, float) else str(value)
            fragments.append(
                f"<strong>{html.escape(key.capitalize())}:</strong> "
                f"{html.escape(shown)}<br>"
            )
        fragments.append("</div>")
    fragments.append("</div>")
    return "".join(fragments)


# Function to process the sentence and extract entities
def extract_entities(sentence):
    """Run the NER pipeline on *sentence* and shape the result for display.

    Each pipeline result is expected to be a dict providing ``lOffset`` and
    ``rOffset`` (character offsets into *sentence*) and ``type``; the keys
    ``title``, ``name`` and ``function`` are optional and, when present, are
    appended to the display label. For every result kept, the returned entity
    is a copy of the pipeline dict extended with:

    * ``start`` / ``end`` - copies of ``lOffset`` / ``rOffset``
    * ``surface`` - the stripped slice of *sentence* between those offsets
    * ``entity`` - the label, e.g. ``"pers - Name: ..."``

    Results whose ``(start, end)`` span was already seen are dropped. Spans
    that merely overlap (without being identical) are all kept.

    Args:
        sentence: The text to analyse.

    Returns:
        ``{"text": sentence, "entities": [...]}``. Empty or whitespace-only
        input yields an empty entity list without calling the model.

    Raises:
        KeyError: If a pipeline result lacks ``lOffset``, ``rOffset`` or
            ``type``.
    """
    # Nothing to analyse: skip the model call entirely
    if not sentence or not sentence.strip():
        return {"text": sentence or "", "entities": []}

    results = ner_pipeline(sentence)

    # Debug output (raw results are printed once)
    print(f"NER results: {results}")
    print(f"Original text: {sentence}")

    # Optional fields and the caption each one gets inside the label
    optional_parts = (("title", "Title"), ("name", "Name"), ("function", "Function"))

    entities = []
    seen_spans = set()  # Exact (start, end) spans already emitted
    for raw in results:
        span = (raw["lOffset"], raw["rOffset"])
        if span in seen_spans:
            continue
        seen_spans.add(span)

        # Work on a copy so the pipeline's own result objects stay untouched
        entity = dict(raw)
        entity["start"], entity["end"] = span
        entity["surface"] = sentence[span[0] : span[1]].strip()

        label = f"{entity['type']}"
        for key, caption in optional_parts:
            if key in entity:
                label += f" - {caption}: {entity[key]}"
        entity["entity"] = label

        entities.append(entity)

    print(f"Entities: {entities}")
    return {"text": sentence, "entities": entities}


# Create Gradio interface
def ner_app_interface():
    """Build the Gradio demo around ``extract_entities`` and launch it.

    The interface has one multi-line text box as input and one
    ``HighlightedText`` component as output, plus a list of French and English
    example sentences. Returns nothing.
    """
    example_sentences = [
        "Des chercheurs de l'Université de Cambridge ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul.",
        "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. (Reuters)",
        "In the year 1789, the Estates-General was convened in France.",
        "The event was held at the Palace of Versailles, a symbol of French monarchy.",
        "At Versailles, Marie Antoinette, the Queen of France, was involved in discussions.",
        "Maximilien Robespierre, a leading member of the National Assembly, also participated.",
        "Jean-Jacques Rousseau, the famous philosopher, was a significant figure in the debate.",
        "Another important participant was Charles de Talleyrand, the Bishop of Autun.",
        "Meanwhile, across the Atlantic, George Washington, the first President of the United States, was shaping policies.",
        "Thomas Jefferson, the nation's Secretary of State, played a key role in drafting policies for the new American government.",
    ]

    input_sentence = gr.Textbox(
        lines=5, label="Input Sentence", placeholder="Enter a sentence for NER:"
    )

    interface = gr.Interface(
        fn=extract_entities,
        inputs=input_sentence,
        outputs=[gr.HighlightedText(label="Text with  mentions")],
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        # Gradio expects one row per example, one cell per input component
        examples=[[text] for text in example_sentences],
        live=False,
    )

    # NOTE(review): share=True requests a public share link; when hosted on
    # Hugging Face Spaces this flag is reportedly ignored — confirm it is
    # still wanted for local runs.
    interface.launch(share=True)


# Build and launch the Gradio app only when this file is executed as a script
if __name__ == "__main__":
    ner_app_interface()