import gradio as gr
from transformers import pipeline, AutoTokenizer
import requests
NEL_MODEL_NAME = "impresso-project/nel-mgenre-multilingual"
# Load the tokenizer and model from the specified pre-trained model name
# The model used here is "https://huggingface.co/impresso-project/nel-mgenre-multilingual"
nel_tokenizer = AutoTokenizer.from_pretrained(NEL_MODEL_NAME)

nel_pipeline = pipeline(
    "generic-nel",
    model=NEL_MODEL_NAME,
    tokenizer=nel_tokenizer,
    trust_remote_code=True,
    device="cpu",
)
print("Model loaded successfully!")
def get_wikipedia_page_props(input_str: str):
"""
Retrieves the QID for a given Wikipedia page name from the specified language Wikipedia.
If the request fails, it falls back to using the OpenRefine Wikidata API.
Args:
input_str (str): The input string in the format "page_name >> language".
Returns:
str: The QID or "NIL" if the QID is not found.
"""
try:
# Preprocess the input string
page_name, language = input_str.split(" >> ")
page_name = page_name.strip()
language = language.strip()
except ValueError:
return "Invalid input format. Use 'page_name >> language'."
wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
wikipedia_params = {
"action": "query",
"prop": "pageprops",
"format": "json",
"titles": page_name,
}
qid = "NIL"
try:
# Attempt to fetch from Wikipedia API
response = requests.get(wikipedia_url, params=wikipedia_params)
response.raise_for_status()
data = response.json()
if "pages" in data["query"]:
page_id = list(data["query"]["pages"].keys())[0]
if "pageprops" in data["query"]["pages"][page_id]:
page_props = data["query"]["pages"][page_id]["pageprops"]
if "wikibase_item" in page_props:
return page_props["wikibase_item"]
else:
return qid
else:
return qid
except Exception as e:
return qid
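

# A minimal usage sketch for the QID lookup above (not called anywhere in this
# app). It assumes network access and that the French Wikipedia page "Paris"
# currently maps to the Wikidata item "Q90"; the helper name and inputs below
# are illustrative only.
def _example_get_wikipedia_page_props():
    print(get_wikipedia_page_props("Paris >> fr"))  # expected: "Q90"
    print(get_wikipedia_page_props("Paris"))  # malformed input -> error message

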
def get_wikipedia_title(qid, language="en"):
    """Return the (title, url) of the Wikipedia page linked to a Wikidata QID,
    or ("NIL", "None") if no sitelink exists for the requested language."""
    api_url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }

    response = requests.get(api_url, params=params)
    data = response.json()

    try:
        title = data["entities"][qid]["sitelinks"][f"{language}wiki"]["title"]
        page_url = data["entities"][qid]["sitelinks"][f"{language}wiki"]["url"]
        return title, page_url
    except KeyError:
        return "NIL", "None"
def disambiguate_sentence(sentence):
    # Run the NEL pipeline on the sentence; only the first linked entity is
    # used, since the interface supports a single [START] ... [END] span.
    linked_entity = nel_pipeline(sentence)[0]

    # Build an HTML snippet with the entity's surface form, its Wikidata QID,
    # the Wikipedia page title, and a clickable link to the Wikipedia page.
    entity_info = f"""<div>
    <strong>Entity:</strong> {linked_entity['surface']} <br>
    <strong>Wikidata QID:</strong> {linked_entity['wkd_id']} <br>
    <strong>Wikipedia Title:</strong> {linked_entity['wkpedia_pagename']} <br>
    <a href="{linked_entity['url']}" target="_blank">Wikipedia Page</a>
    </div>
    """
    return entity_info
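

# A minimal usage sketch for the end-to-end linking step (not called directly
# here; Gradio invokes `disambiguate_sentence` for each submission). Running it
# downloads the model and performs inference, and the QID/URL embedded in the
# returned HTML depend on the model and the current state of Wikidata/Wikipedia.
def _example_disambiguate_sentence():
    html = disambiguate_sentence("We are going to [START] Paris [END].")
    print(html)

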
def nel_app_interface():
    input_sentence = gr.Textbox(
        lines=5,
        label="Input Sentence",
        placeholder="Enter your sentence here",
    )
    output_entities = gr.HTML(label="Linked Entities:")

    # Interface definition
    interface = gr.Interface(
        fn=disambiguate_sentence,
        inputs=input_sentence,
        outputs=output_entities,
        title="Entity Linking with impresso-project/nel-mgenre-multilingual",
        description="Link entities using the `impresso-project/nel-mgenre-multilingual` model under the hood! "
        "We recommend using shorter texts (i.e. sentences, not full paragraphs). <br>"
        "Sentences should be provided in the following format: <br>"
        "<i>We are going to `[START]` Paris `[END]`</i> <br>"
        "This format tells the model which entity to disambiguate: the entity "
        "must be surrounded by `[START]` and `[END]`. <br> <br>"
        "<b>Warning</b>: Only one entity per sentence is supported at the moment!",
        examples=[
            [
                "Des chercheurs de l' [START] Université de Cambridge [END] ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
            ],
            [
                "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. ([START] Reuters [END])"
            ],
            [
                "In the [START] year 1789 [END], the Estates-General was convened in France."
            ],
            [
                "The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy."
            ],
            [
                "At Versailles, [START] Antoinette, the Queen of France [END], was involved in discussions."
            ],
            [
                "[START] Maximilien Robespierre [END], a leading member of the National Assembly, also participated."
            ],
            [
                "[START] Jean-Jacques Rousseau, the famous philosopher [END], was a significant figure in the debate."
            ],
            [
                "Another important participant was [START] Charles de Talleyrand, the Bishop of Autun [END]."
            ],
            [
                "Meanwhile, across the Atlantic, [START] George Washington, the first President of the United States [END], was shaping policies."
            ],
            [
                "[START] Thomas Jefferson, the nation's Secretary of State [END], played a key role in drafting policies for the new American government."
            ],
        ],
    )

    interface.launch()


if __name__ == "__main__":
    nel_app_interface()