File size: 3,575 Bytes
d5d2a07
c619232
d5d2a07
 
 
 
 
 
 
5436b2b
 
 
 
 
 
 
 
d5d2a07
9df3a2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd1b5ba
49dd9a6
 
53e96e8
 
dd1b5ba
 
 
5b5c00d
 
49dd9a6
 
53e96e8
 
 
 
 
dd1b5ba
 
 
2dac3ae
 
 
 
 
53e96e8
 
 
49dd9a6
9df3a2f
49dd9a6
 
d5d2a07
 
 
5436b2b
dd1b5ba
 
 
464c568
49dd9a6
5436b2b
d5d2a07
 
 
5436b2b
 
 
9df3a2f
d5d2a07
 
 
 
 
 
 
5436b2b
464c568
 
 
 
 
ac886a9
d5d2a07
5436b2b
ac886a9
d5d2a07
5436b2b
d5d2a07
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import html
import numbers

import gradio as gr
from transformers import pipeline, AutoTokenizer

# Model identifier handed to both the tokenizer loader and the pipeline below
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# Load the tokenizer for MODEL_NAME (runs once, when this module is imported)
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the NER pipeline once at import time so every request reuses it.
# NOTE(review): "generic-ner" appears to be a custom task rather than a stock
# transformers one, which is presumably why trust_remote_code=True is passed
# -- confirm against the model repository before changing either argument.
ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)


def format_entities_as_html(entities):
    """Render entity dicts as an HTML unordered list.

    Each entity becomes one ``<li>``; every field of the entity is emitted as
    ``<strong>Key:</strong> value<br>`` in the dict's iteration order. The
    keys ``start``, ``end``, ``index`` and ``word`` are omitted.

    Keys and values are HTML-escaped before being embedded, because entity
    fields may contain text taken from user input.

    Args:
        entities: Iterable of dicts describing the entities to display.

    Returns:
        The HTML markup as a string; ``"<ul></ul>"`` when ``entities`` is
        empty.
    """
    excluded_keys = {"start", "end", "index", "word"}
    list_items = []

    for entity in entities:
        fields = []
        for key, value in entity.items():
            if key in excluded_keys:
                continue

            # Scores may be NumPy scalars (e.g. np.float32), which are not
            # instances of the built-in float; test against the numeric tower
            # instead, while leaving integers (and bools) unformatted.
            is_fractional = isinstance(value, numbers.Real) and not isinstance(
                value, numbers.Integral
            )
            rendered = f"{value:.2f}" if is_fractional else str(value)

            label = html.escape(str(key).capitalize())
            fields.append(f"<strong>{label}:</strong> {html.escape(rendered)}<br>")

        list_items.append(f"<li>{''.join(fields)}</li>")

    return f"<ul>{''.join(list_items)}</ul>"


# Helper that de-duplicates, normalises and orders entities before rendering
def prepare_entities_for_highlight(text, results):
    """Collect the entities in ``results`` and render them as an HTML list.

    Entities sharing the exact same ``(start, end)`` span are kept only once
    (first occurrence wins); partially overlapping spans are all kept. Each
    kept entity is copied, given a ``text`` field holding the stripped slice
    ``text[start:end]``, and has its ``word`` field removed if present. The
    dicts inside ``results`` are not modified.

    Args:
        text: The sentence that was analysed.
        results: Either a mapping whose values are lists of entity dicts, or
            a flat list of entity dicts. Every entity dict must provide
            ``start`` and ``end`` offsets into ``text``.

    Returns:
        The HTML string produced by ``format_entities_as_html`` for the
        entities sorted by ``start``.
    """
    # Debug output of the analysed text
    print(f"Original text: {text}")

    # The entities look like:
    # [{'entity': 'org.ent.pressagency.Reuters', 'score': np.float32(98.47), 'index': 78, 'word': 'Reuters', 'start': 440, 'end': 447}]
    # NOTE(review): the pipeline is iterated as a mapping of category -> list
    # here, but the sample above is a flat list; both shapes are accepted --
    # confirm which one the pipeline actually returns.
    if isinstance(results, dict):
        entity_groups = list(results.values())
    else:
        entity_groups = [results] if results else []

    entities = []
    seen_spans = set()  # Exact (start, end) spans already emitted

    for entity_list in entity_groups:
        for raw_entity in entity_list:
            span = (raw_entity["start"], raw_entity["end"])
            if span in seen_spans:
                continue
            seen_spans.add(span)

            # Work on a copy so the caller's pipeline output stays intact
            entity = dict(raw_entity)
            entity["text"] = text[span[0] : span[1]].strip()
            entity.pop("word", None)  # Tolerate entities without a "word" key
            print(f"Entity text: {entity}")

            entities.append(entity)

    # Stable sort: entities with equal start keep their discovery order
    entities.sort(key=lambda item: item["start"])

    return format_entities_as_html(entities)


# Gradio callback: run NER on the sentence and return the entities as HTML
def extract_entities(sentence):
    """Run the NER pipeline on ``sentence`` and return its entities as HTML.

    Args:
        sentence: The text to analyse.

    Returns:
        The HTML string built by ``prepare_entities_for_highlight`` from the
        pipeline output.
    """
    ner_output = ner_pipeline(sentence)

    # Dump the raw pipeline output to help debugging its format
    print(f"NER results: {ner_output}")

    # Turn the raw output into the HTML shown by the output component
    return prepare_entities_for_highlight(text=sentence, results=ner_output)


# Build and launch the Gradio UI
def ner_app_interface():
    """Assemble the Gradio interface around ``extract_entities`` and launch it.

    The interface has a five-line text box as input and an HTML component as
    output, ships one example sentence, and is launched with ``share=True``.
    """
    # One example row; each row is a list with one value per input component
    example_sentences = [
        [
            "In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles."
        ]
    ]

    demo = gr.Interface(
        fn=extract_entities,
        inputs=gr.Textbox(
            lines=5,
            label="Input Sentence",
            placeholder="Enter a sentence for NER...",
        ),
        outputs=gr.HTML(label="Extracted Entities"),
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        examples=example_sentences,
        live=False,
    )

    demo.launch(share=True)


# Launch the app only when this file is executed as a script, not on import
if __name__ == "__main__":
    ner_app_interface()