File size: 7,052 Bytes
bbcf937
59c3f8c
bbcf937
44b938c
24d58c0
117cafd
 
 
40ec3d4
117cafd
bbcf937
cb76a4b
 
 
 
 
 
 
 
 
 
 
 
0bec8b3
542aecd
117cafd
 
bbcf937
9494755
 
 
 
 
 
 
 
 
 
 
c3e1350
dedd775
9494755
 
 
 
fdfd405
 
9494755
 
 
 
 
dedd775
320ee5a
9494755
bbcf937
ba22fae
 
 
 
 
 
 
 
 
c3f646b
ba22fae
 
320ee5a
 
c9574f5
 
971e940
c9574f5
971e940
44b938c
 
 
 
 
 
 
 
bbcf937
 
3dac3c5
c9574f5
bbcf937
9494755
bbcf937
117cafd
9494755
0fe6ed0
 
383d6aa
 
 
117cafd
fdfd405
0fe6ed0
 
 
 
 
 
 
 
 
 
 
 
117cafd
0fe6ed0
117cafd
 
 
 
0fe6ed0
 
117cafd
98acdc3
 
 
117cafd
98acdc3
 
117cafd
 
 
 
 
98acdc3
a4303d6
 
 
 
98acdc3
117cafd
98acdc3
 
 
 
 
 
 
 
 
 
 
 
 
 
117cafd
98acdc3
 
117cafd
98acdc3
 
 
5cb9d08
 
 
44b938c
5cb9d08
 
 
 
 
 
542aecd
5cb9d08
542aecd
 
5cb9d08
24d58c0
49703d7
31c00d2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import json
import spacy

# The German spaCy/entity-fishing pipeline is intentionally NOT loaded here.
# Streamlit re-executes this script on every interaction, so an eager load at
# module level would rebuild the pipeline on each rerun (even when English is
# selected) and its result was never used. load_model() below builds the
# pipeline on demand and keeps it in Streamlit's resource cache.

# Page config: tab title/icon, wide layout, collapsed sidebar and the
# entries shown in Streamlit's app menu.
page_settings = dict(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO",
    },
)
st.set_page_config(**page_settings)

# Sidebar
st.sidebar.image("logo-wordlift.png")

# The option collections are tuples rather than sets: set iteration order for
# strings changes between interpreter starts, which made the order of the
# choices (and therefore each selectbox's default, the first item) arbitrary.
language_options = ("English", "German")
selected_language = st.sidebar.selectbox("Select the Language", language_options)

# Based on selected language, display model and entity set options
if selected_language != "German":
    # Only show these options for languages other than German
    model_options = ("aida_model", "wikipedia_model_with_numbers")
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    # Select entity_set
    entity_set_options = ("wikidata", "wikipedia")
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
else:
    # The German pipeline takes no model/entity-set choice.
    selected_model_name = None
    selected_entity_set = None

@st.cache_resource  # keep the loaded model in Streamlit's resource cache
def load_model(selected_language, model_name=None, entity_set=None):
    """Build the entity-linking model for the selected language.

    For ``"German"`` a spaCy ``de_core_news_sm`` pipeline with an
    ``entityfishing`` component (configured for ``de``) is returned. For any
    other language a ReFinED model is created from ``model_name`` and
    ``entity_set``.
    """
    if selected_language != "German":
        # Pretrained ReFinED model for every non-German language
        return Refined.from_pretrained(model_name=model_name, entity_set=entity_set)

    german_pipeline = spacy.load("de_core_news_sm")
    german_pipeline.add_pipe("entityfishing", config={"language": "de"})
    return german_pipeline

# Obtain the model matching the sidebar choices (served from the cache when
# the same combination was requested before)
model = load_model(
    selected_language,
    model_name=selected_model_name,
    entity_set=selected_entity_set,
)

# BibTeX entry for the ReFinED paper, shown in a collapsible sidebar section
citation = """
@inproceedings{ayoola-etal-2022-refined,
title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
booktitle = "NAACL",
year = "2022"
}
"""
citations_panel = st.sidebar.expander('Citations')
citations_panel.markdown(citation)

# Helper functions
def get_wikidata_id(entity_string):
    """Return the Wikidata ID found in *entity_string* and its link path.

    The ID is the text between the first and the second ``=`` of
    *entity_string* (or up to the end when there is only one ``=``).

    Returns:
        A dict with ``"id"`` (the extracted ID) and ``"link"`` (the ID
        prefixed with ``http/www.wikidata.org/entity/``).

    Raises:
        IndexError: if *entity_string* contains no ``=``.
    """
    # NOTE(review): the link is deliberately scheme-less ("http/..."); it is
    # presumably meant to be appended to the WordLift id endpoint - confirm.
    wikidata_qid = entity_string.split("=")[1]
    return {
        "id": wikidata_qid,
        "link": f"http/www.wikidata.org/entity/{wikidata_qid}",
    }
    
def get_entity_data(entity_link, timeout=10):
    """Fetch the JSON description of an entity from the WordLift id endpoint.

    Args:
        entity_link: Identifier appended to ``https://api.wordlift.io/id/``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the app indefinitely.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an HTTP error status, or the body is not valid JSON.
    """
    try:
        response = requests.get(
            f'https://api.wordlift.io/id/{entity_link}',
            timeout=timeout,
        )
        # Treat 4xx/5xx answers as "no data" instead of parsing an error body
        # as if it described the entity.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection problems, timeouts, HTTP error status.
        # ValueError: body could not be decoded as JSON.
        print(f"Exception when fetching data for entity: {entity_link}. Exception: {e}")
        return None
    
# Input form: the text area and the submit button are attached to the form
# container through its own methods.
input_form = st.form(key='my_form')
text_input = input_form.text_area(label='Enter a sentence')
submit_button = input_form.form_submit_button(label='Analyze')

# Highlight colour per entity "@type" value; anything else gets the default.
_ENTITY_TYPE_COLORS = {
    "Place": "#8AC7DB",
    "Organization": "#ADD8E6",
    "Person": "#67B7D1",
    "Product": "#2ea3f2",
    "CreativeWork": "#00BFFF",
    "Event": "#1E90FF",
}
_DEFAULT_ENTITY_COLOR = "#8ef"


def _entity_color(entity_data):
    """Return the highlight colour for an entity from its ``@type`` field."""
    entity_type = entity_data.get("@type") if isinstance(entity_data, dict) else None
    # "@type" may be missing or not a plain string (e.g. a list); only a
    # string can be looked up in the colour table.
    if isinstance(entity_type, str):
        return _ENTITY_TYPE_COLORS.get(entity_type, _DEFAULT_ENTITY_COLOR)
    return _DEFAULT_ENTITY_COLOR


def _build_annotated_segments(text, entities_map, entities_data):
    """Split *text* into plain strings and ``(text, id, colour)`` annotations.

    For every entity in *entities_map* (in insertion order) the first
    occurrence in *text* that does not overlap an already annotated span is
    turned into an annotation tuple. The segments are returned in reading
    order, ready to be unpacked into ``annotated_text``.

    The spans are located in the original text, so the user's input is never
    round-tripped through ``str``/``eval``: braces or Python expressions typed
    by the user stay plain text, and an entity name can no longer be replaced
    inside a previously inserted annotation.
    """
    claimed = []  # (start, end, annotation) for every annotated span
    for entity_string, entity_info in entities_map.items():
        if not entity_string:
            continue
        search_from = 0
        while True:
            start = text.find(entity_string, search_from)
            if start == -1:
                break
            end = start + len(entity_string)
            if all(end <= s or start >= e for s, e, _ in claimed):
                color = _entity_color(entities_data.get(entity_string))
                claimed.append((start, end, (entity_string, entity_info["id"], color)))
                break
            # This occurrence overlaps an existing annotation; try the next one.
            search_from = start + 1

    segments = []
    cursor = 0
    for start, end, annotation in sorted(claimed, key=lambda span: span[:2]):
        if start > cursor:
            segments.append(text[cursor:start])
        segments.append(annotation)
        cursor = end
    if cursor < len(text):
        segments.append(text[cursor:])
    return segments


# When processing the text, check the language and adjust processing accordingly
if text_input:
    if selected_language == "German":
        doc_de = model(text_input)
        # Map entities to a format similar to English output
        entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc_de.ents]
        # Debug
        for ent in doc_de.ents:
            st.write(f"Entity: {ent.text}, Label: {ent.label_}, QID: {ent._.kb_qid}, URL: {ent._.url_wikidata}")
    else:
        entities = model.process_text(text_input)

    # entities_map: entity text -> {"id", "link"}
    # entities_data: entity text -> JSON returned by get_entity_data (only
    # for entities whose lookup succeeded)
    entities_map = {}
    entities_data = {}
    for entity in entities:
        if selected_language == "German":
            entity_string, _label, wikidata_id, wikidata_url = entity
            entities_map[entity_string] = {"id": wikidata_id, "link": wikidata_url}
            # NOTE(review): this branch passes the pipeline's Wikidata URL as-is,
            # while the English branch passes the "http/www.wikidata.org/entity/<id>"
            # path built by get_wikidata_id - confirm the endpoint accepts both.
            # Entities without a URL are not looked up at all.
            entity_data = get_entity_data(wikidata_url) if wikidata_url else None
        else:
            # The ReFinED result is parsed from its string form:
            # element 0 is the mention text, element 1 the linked entity.
            # NOTE(review): mentions containing ", " or quotes are not parsed
            # correctly by this split - consider reading the span's fields.
            parts = str(entity).strip('][').replace("'", "").split(', ')
            if len(parts) < 2 or "wikidata" not in parts[1]:
                continue
            entity_string = parts[0].strip()
            entities_map[entity_string] = get_wikidata_id(parts[1])
            entity_data = get_entity_data(entities_map[entity_string]["link"])
        if entity_data is not None:
            entities_data[entity_string] = entity_data

    combined_entity_info_dictionary = {
        name: [info, entities_data.get(name)]
        for name, info in entities_map.items()
    }

    if submit_button:
        # JSON-LD data; entities without fetched data are left out so that
        # "mentions" never contains null entries.
        json_ld_data = {
            "@context": "https://schema.org",
            "@type": "WebPage",
            "mentions": [
                entities_data[name] for name in entities_map if name in entities_data
            ],
        }

        # Text interleaved with (entity, id, colour) annotation tuples
        final_text = _build_annotated_segments(text_input, entities_map, entities_data)

        # Pass the final_text to the annotated_text function
        annotated_text(*final_text)

        with st.expander("See annotations"):
            st.write(combined_entity_info_dictionary)

        with st.expander("Here is the final JSON-LD"):
            st.json(json_ld_data)  # Output JSON-LD