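# Streamlit demo app for entity linking (NEL/NED/NER): English text is processed
# with ReFinED, German text with spaCy + entity-fishing; recognized entities are
# enriched via the WordLift API and rendered as annotated text plus JSON-LD.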
import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import json
import spacy
import ast  # used below to safely parse annotation tuples instead of eval()

# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)
# Sidebar
st.sidebar.image("logo-wordlift.png")
# Use a list (not a set) so the option order, and the index=0 default, is stable
language_options = ["English", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)
# Based on selected language, display model and entity set options
if selected_language != "German":
    # Only show these options for languages other than German
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)

    # Select entity_set
    entity_set_options = ["wikidata", "wikipedia"]
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)

    # Add citation for ReFinED
    citation = """
    @inproceedings{ayoola-etal-2022-refined,
        title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
        author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
        booktitle = "NAACL",
        year = "2022"
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(citation)
else:
    selected_model_name = None
    selected_entity_set = None

    # Add citation for entity-fishing (raw string so the \url macro is not
    # interpreted as a Python escape sequence)
    citation = r"""
    @misc{entity-fishing,
        title = {entity-fishing},
        howpublished = {\url{https://github.com/kermitt2/entity-fishing}},
        publisher = {GitHub},
        year = {2016--2023},
        archivePrefix = {swh},
        eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
    }"""
    with st.sidebar.expander('Citations'):
        st.markdown(citation)

@st.cache_resource  # 👈 Add the caching decorator
def load_model(selected_language, model_name=None, entity_set=None):
    if selected_language == "German":
        # Load the German-specific spaCy pipeline with the entity-fishing component
        nlp_model_de = spacy.load("de_core_news_lg")
        nlp_model_de.add_pipe("entityfishing")
        return nlp_model_de
    else:
        # Load the pretrained ReFinED model for other languages
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model

# Use the cached model
model = load_model(selected_language, selected_model_name, selected_entity_set)
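# st.cache_resource keeps one model instance per (language, model name, entity set)
# combination, so Streamlit reruns reuse the loaded model instead of reloading it
# on every widget interaction.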

# Helper functions
def get_wikidata_id(entity_string):
    entity_list = entity_string.split("=")
    entity_id = str(entity_list[1])
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}
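# get_wikidata_id example: a fragment like "wikidata=Q64" (illustrative value)
# becomes {"id": "Q64", "link": "http://www.wikidata.org/entity/Q64"}.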

def get_entity_data(entity_link):
    try:
        # Rewrite the scheme so the URI can be embedded in the WordLift API path
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        return response.json()
    except Exception as e:
        print(f"Exception when fetching data for entity: {entity_link}. Exception: {e}")
        return None
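# get_entity_data: the WordLift API is expected to return a JSON-LD description
# of the entity; its "@type" field drives the color coding further below. Any
# network or parsing failure falls back to None.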

# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence')
    submit_button = st.form_submit_button(label='Analyze')

# Initialization
entities_map = {}
entities_data = {}
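# entities_map: surface form -> {"id": <Wikidata ID>, "link": <Wikidata entity URI>}
# entities_data: surface form -> JSON-LD payload fetched from the WordLift API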
if text_input:
    if selected_language == "German":
        doc_de = model(text_input)
        entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc_de.ents]
        for entity in entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
                # Normalize the wiki page URL to the canonical Wikidata entity URI
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else:
        entities = model.process_text(text_input)
        for entity in entities:
            # ReFinED spans stringify as a bracketed, comma-separated list;
            # recover the surface form and the Wikidata reference from it
            single_entity_list = str(entity).strip('][').replace("\'", "").split(', ')
            if len(single_entity_list) >= 2 and "wikidata" in single_entity_list[1]:
                entities_map[single_entity_list[0].strip()] = get_wikidata_id(single_entity_list[1])
                entity_data = get_entity_data(entities_map[single_entity_list[0].strip()]["link"])
                if entity_data is not None:
                    entities_data[single_entity_list[0].strip()] = entity_data
    combined_entity_info_dictionary = {k: [entities_map[k], entities_data.get(k)] for k in entities_map}
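    # Shape: {surface form: [{"id": ..., "link": ...}, <JSON-LD dict or None>]}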

    if submit_button:
        # Prepare a list to hold the final output
        final_text = []

        # JSON-LD data
        json_ld_data = {
            "@context": "https://schema.org",
            "@type": "WebPage",
            "mentions": []
        }

        # Replace each entity in the text with its annotated version
        for entity_string, entity_info in entities_map.items():
            # Check if the entity has a valid Wikidata link
            if entity_info["link"] is None or entity_info["link"] == "None":
                continue  # skip this entity

            entity_data = entities_data.get(entity_string, None)
            entity_type = None
            if entity_data is not None:
                entity_type = entity_data.get("@type", None)

            # Use different colors based on the entity's type
            color = "#8ef"  # Default color
            if entity_type == "Place":
                color = "#8AC7DB"
            elif entity_type == "Organization":
                color = "#ADD8E6"
            elif entity_type == "Person":
                color = "#67B7D1"
            elif entity_type == "Product":
                color = "#2ea3f2"
            elif entity_type == "CreativeWork":
                color = "#00BFFF"
            elif entity_type == "Event":
                color = "#1E90FF"

            entity_annotation = (entity_string, entity_info["id"], color)
            text_input = text_input.replace(entity_string, f'{{{str(entity_annotation)}}}', 1)

            # Add the entity's JSON-LD to the mentions list
            entity_json_ld = combined_entity_info_dictionary[entity_string][1]
            if entity_json_ld and entity_json_ld.get("link") != "None":
                json_ld_data["mentions"].append(entity_json_ld)
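        # At this point each linked entity in text_input has been replaced by a
        # placeholder like {('Berlin', 'Q64', '#8AC7DB')} (illustrative values),
        # which the loop below parses back into an annotation tuple.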
        # Split the modified text_input into plain-text and annotation pieces
        text_list = text_input.split("{")
        for item in text_list:
            if "}" in item:
                item_list = item.split("}")
                # ast.literal_eval safely parses the stringified tuple
                # (a safer alternative to eval on user-derived text)
                final_text.append(ast.literal_eval(item_list[0]))
                if len(item_list[1]) > 0:
                    final_text.append(item_list[1])
            else:
                final_text.append(item)

        # annotated_text renders plain strings as-is and
        # (text, label, color) tuples as colored annotations
        annotated_text(*final_text)
        with st.expander("See annotations"):
            st.write(combined_entity_info_dictionary)

        with st.expander("Here is the final JSON-LD"):
            st.json(json_ld_data)  # Output JSON-LD