File size: 6,166 Bytes
06e9286 6627fc9 06e9286 aeeec0d 06e9286 44fb8bb 1a30cc1 44fb8bb 1a30cc1 44fb8bb 1a30cc1 44fb8bb 6627fc9 94113a9 6627fc9 44fb8bb 1a30cc1 44fb8bb 1a30cc1 6627fc9 1a30cc1 ac76025 1a30cc1 06e9286 94113a9 ac76025 94113a9 1a30cc1 94113a9 ac76025 74ff55c ac76025 74ff55c ac76025 94113a9 ac76025 94113a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import requests
# Load the multilingual entity-linking (NEL) tokenizer and model once at
# import time so the Gradio handler can reuse them across requests.
tokenizer = AutoTokenizer.from_pretrained("impresso-project/nel-hipe-multilingual")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "impresso-project/nel-hipe-multilingual"
).eval()  # eval(): inference mode (disables dropout etc.)
print("Model loaded successfully!")
def get_wikipedia_page_props(input_str: str) -> str:
    """
    Retrieve the Wikidata QID for a Wikipedia page name.

    Args:
        input_str: Input in the format "page_name >> language",
            e.g. "Paris >> fr".

    Returns:
        The QID string (e.g. "Q90"), "NIL" when no QID could be resolved,
        or an error message string for malformed input.
    """
    try:
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        # Preserve the historical error string for malformed input.
        return "Invalid input format. Use 'page_name >> language'."

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    try:
        # Bound the wait so a slow/unreachable API cannot hang the UI.
        response = requests.get(wikipedia_url, params=wikipedia_params, timeout=10)
        response.raise_for_status()
        data = response.json()
        # The API keys pages by (possibly negative) page id; take the first.
        pages = data["query"]["pages"]
        page = next(iter(pages.values()))
        return page.get("pageprops", {}).get("wikibase_item", "NIL")
    except (requests.exceptions.RequestException, KeyError, ValueError, StopIteration):
        # Network failure or unexpected payload shape: treat as unresolved.
        return "NIL"
def get_wikipedia_title(qid: str, language: str = "en"):
    """
    Look up the Wikipedia title and URL for a Wikidata QID.

    Args:
        qid: Wikidata entity id, e.g. "Q90".
        language: Wikipedia language edition to query (default "en").

    Returns:
        A (title, url) tuple on success, or ("NIL", "None") when the
        entity has no sitelink for the requested language edition.
    """
    url = "https://www.wikidata.org/w/api.php"  # constant endpoint, no placeholders
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }
    # Bound the wait so a slow/unreachable API cannot hang the UI.
    response = requests.get(url, params=params, timeout=10)
    data = response.json()
    try:
        sitelink = data["entities"][qid]["sitelinks"][f"{language}wiki"]
        return sitelink["title"], sitelink["url"]
    except KeyError:
        return "NIL", "None"
def disambiguate_sentence(sentence):
    """
    Link the [START]...[END]-tagged entity in *sentence* via the NEL model.

    Generates candidate "page_name >> language" strings with the seq2seq
    model, resolves the top beam to a Wikidata QID, then to a Wikipedia
    title/URL.

    Args:
        sentence: Input text with exactly one entity wrapped in
            [START] ... [END] markers.

    Returns:
        An HTML snippet with the entity title, QID and a Wikipedia link,
        or the plain string "No entity found." when resolution fails.
    """
    outputs = model.generate(
        **tokenizer([sentence], return_tensors="pt"),
        num_beams=5,
        num_return_sequences=5,
        max_new_tokens=30,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"Decoded: {decoded}")

    # Top beam is expected to be "page_name >> language".
    wikipedia_name = decoded[0]
    qid = get_wikipedia_page_props(wikipedia_name)
    print(f"QID: {qid}")

    # Bail out *before* querying Wikidata: the original code issued a
    # wasted API call with the invalid id "NIL" and discarded the result.
    if qid == "NIL":
        return "No entity found."

    title, url = get_wikipedia_title(qid)

    # HTML output with a clickable link to the linked Wikipedia page.
    entity_info = f"""<div>
    <strong>Entity:</strong> {title} <br>
    <strong>QID:</strong> {qid} <br>
    <a href="{url}" target="_blank">Wikipedia Page</a>
    </div>
    """
    return entity_info
def nel_app_interface():
    """Build and launch the Gradio UI for single-entity linking."""
    # Sample inputs shown below the interface; each marks exactly one
    # entity with [START] ... [END].
    example_sentences = [
        [
            "Des chercheurs de l' [START] Université de Cambridge [END] ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
        ],
        [
            "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. ([START] Reuters [END])"
        ],
        [
            "In the [START] year 1789 [END], the Estates-General was convened in France."
        ],
        ["[START] King Louis XVI, ruler of France [END], called for the meeting."],
        [
            "The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy."
        ],
        [
            "At Versailles, Marie Antoinette, the Queen of France, was involved in discussions."
        ],
        [
            "Maximilien Robespierre, a leading member of the National Assembly, also participated."
        ],
        [
            "[START] Jean-Jacques Rousseau, the famous philosopher [END], was a significant figure in the debate."
        ],
        [
            "Another important participant was [START] Charles de Talleyrand, the Bishop of Autun [END]."
        ],
        [
            "Meanwhile, across the Atlantic, [START] George Washington, the first President of the United States [END], was shaping policies."
        ],
        [
            "[START] Thomas Jefferson, the nation's Secretary of State [END], played a key role in drafting policies for the new American government."
        ],
    ]

    usage_notes = (
        "Link entities using the `impresso-project/nel-hipe-multilingual` model under the hood! "
        "We recommend using shorter texts (ie sentences, not full paragraphs). <br>"
        "The sentences in the following format: <br>"
        "<< We are going to [START] Paris [END].>> "
        "This format ensures that the model knows which entities to disambiguate, more exactly the "
        "entity should be surrounded by `[START]` and `[END]`. <br>"
        "Warning: Only one entity per sentence is supported at the moment!"
    )

    # Input/output widgets.
    sentence_box = gr.Textbox(
        lines=5,
        label="Input Sentence",
        placeholder="Enter your sentence here:",
    )
    entity_html = gr.HTML(label="Linked Entity")

    demo = gr.Interface(
        fn=disambiguate_sentence,
        inputs=sentence_box,
        outputs=entity_html,
        title="Entity Linking with impresso-project/nel-hipe-multilingual",
        description=usage_notes,
        examples=example_sentences,
    )
    demo.launch()
if __name__ == "__main__":
    # Launch the Gradio demo only when executed as a script (not on import).
    nel_app_interface()
|