|
import gradio as gr |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
import requests |
|
|
|
# Load the tokenizer and the seq2seq model once, at import time, so every
# request served by the app reuses the same in-memory instances.
_CHECKPOINT = "impresso-project/nel-hipe-multilingual"

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
# Inference only: switch the model to evaluation mode.
model.eval()

print("Model loaded successfully!")
|
|
|
|
|
def get_wikipedia_page_props(input_str: str, timeout: float = 10.0):
    """
    Retrieves the QID for a given Wikipedia page name from the specified language Wikipedia.

    The page's ``pageprops`` are requested from the MediaWiki API of
    ``https://{language}.wikipedia.org`` and the ``wikibase_item`` property
    is returned when present.

    Args:
        input_str (str): The input string in the format "page_name >> language".
        timeout (float): Seconds to wait for the Wikipedia API before giving up.

    Returns:
        str: The QID, or "NIL" when the input is malformed, the request fails,
            or the page has no linked Wikidata item.
    """
    nil = "NIL"
    if not isinstance(input_str, str):
        return nil

    # Split on the LAST separator so that a page title which itself contains
    # " >> " is kept intact.
    page_name, separator, language = input_str.rpartition(" >> ")
    page_name = page_name.strip()
    language = language.strip()
    if not separator or not page_name:
        return nil

    # The language code becomes part of the request hostname, so only accept
    # characters that can legitimately appear in a Wikipedia subdomain
    # (ASCII letters, digits and hyphens, e.g. "fr", "simple", "zh-min-nan").
    if not (language.isascii() and language.replace("-", "").isalnum()):
        return nil

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    try:
        response = requests.get(
            wikipedia_url, params=wikipedia_params, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network problem, HTTP error status, or a body that is not JSON.
        return nil

    try:
        pages = data["query"]["pages"]
        # A single title was requested, so only the first page is relevant.
        first_page = next(iter(pages.values()))
        return first_page["pageprops"]["wikibase_item"]
    except (AttributeError, LookupError, StopIteration, TypeError):
        # Unexpected response shape, unknown page, or no Wikidata item.
        return nil
|
|
|
|
|
def get_wikipedia_title(qid, language="en", timeout=10.0):
    """
    Looks up the Wikipedia page title and URL linked to a Wikidata item.

    Uses the Wikidata ``wbgetentities`` API, restricted to the sitelink of
    the ``{language}wiki`` site.

    Args:
        qid (str): The Wikidata item ID, e.g. "Q90".
        language (str): Language code of the Wikipedia edition to look up.
        timeout (float): Seconds to wait for the Wikidata API before giving up.

    Returns:
        tuple[str, str]: ``(title, url)`` of the Wikipedia page, or
            ``("NIL", "None")`` when ``qid`` is not an item ID, the request
            fails, or the item has no page in that edition.
    """
    not_found = ("NIL", "None")

    # Only item IDs ("Q" followed by digits) can carry sitelinks; this also
    # avoids a pointless network round trip for placeholders such as "NIL".
    if not (
        isinstance(qid, str)
        and qid.isascii()
        and qid.startswith("Q")
        and qid[1:].isdigit()
    ):
        return not_found

    site = f"{language}wiki"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": site,
    }

    try:
        response = requests.get(
            "https://www.wikidata.org/w/api.php", params=params, timeout=timeout
        )
        response.raise_for_status()
        sitelink = response.json()["entities"][qid]["sitelinks"][site]
        return sitelink["title"], sitelink["url"]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network/HTTP failure, non-JSON body, or no sitelink for this site.
        return not_found
|
|
|
|
|
def disambiguate_sentence(sentence):
    """
    Links the entity marked in ``sentence`` and renders the result as HTML.

    The sentence is passed to the seq2seq model (beam search, 5 beams); the
    first decoded sequence is expected to look like "page_name >> language"
    and is resolved to a Wikidata QID and then to a Wikipedia page.

    Args:
        sentence (str): Input text with the mention wrapped in
            ``[START]`` ... ``[END]``.

    Returns:
        str: An HTML snippet with the entity title, QID and a link, or
            "No entity found." when nothing could be linked.
    """
    from html import escape

    no_entity = "No entity found."
    if not sentence or not sentence.strip():
        return no_entity

    encoded = tokenizer([sentence], return_tensors="pt")
    outputs = model.generate(
        **encoded,
        num_beams=5,
        num_return_sequences=5,
        max_new_tokens=30,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"Decoded: {decoded}")
    if not decoded:
        return no_entity

    wikipedia_name = decoded[0]
    qid = get_wikipedia_page_props(wikipedia_name)
    print(f"QID: {qid}")

    # Anything that is not a real item ID ("NIL", None, an error message)
    # means no entity was linked; stop before doing further lookups.
    if not (
        isinstance(qid, str)
        and qid.isascii()
        and qid.startswith("Q")
        and qid[1:].isdigit()
    ):
        return no_entity

    predicted_page, _, predicted_language = wikipedia_name.rpartition(" >> ")
    predicted_language = predicted_language.strip()

    # Prefer the English page; fall back to the edition the model predicted.
    title, url = get_wikipedia_title(qid)
    if title == "NIL" and predicted_language and predicted_language != "en":
        title, url = get_wikipedia_title(qid, predicted_language)

    link_label = "Wikipedia Page"
    if title == "NIL":
        # No Wikipedia page could be resolved: link to the Wikidata item
        # rather than emitting a broken href="None".
        title = predicted_page.strip() or wikipedia_name
        url = f"https://www.wikidata.org/wiki/{qid}"
        link_label = "Wikidata Page"

    # Title and URL come from remote services; escape them before embedding.
    entity_info = f"""<div>
<strong>Entity:</strong> {escape(title)} <br>
<strong>QID:</strong> {escape(qid)} <br>
<a href="{escape(url)}" target="_blank">{link_label}</a>
</div>
"""
    return entity_info
|
|
|
|
|
def nel_app_interface():
    """
    Builds the Gradio demo for entity linking and launches it.

    The interface takes one sentence in a textbox, runs
    ``disambiguate_sentence`` on it and shows the returned HTML. Every
    bundled example marks exactly one mention with ``[START]`` / ``[END]``,
    matching the input format the description asks users to follow.
    """
    input_sentence = gr.Textbox(
        lines=5,
        label="Input Sentence",
        placeholder="Enter your sentence here:",
    )
    output_entities = gr.HTML(label="Linked Entity")

    description = (
        "Link entities using the `impresso-project/nel-hipe-multilingual` model under the hood! "
        "We recommend using shorter texts (ie sentences, not full paragraphs). <br>"
        "The sentences in the following format: <br>"
        "<< We are going to [START] Paris [END].>> "
        "This format ensures that the model knows which entities to disambiguate, more exactly the "
        "entity should be surrounded by `[START]` and `[END]`. <br>"
        "Warning: Only one entity per sentence is supported at the moment!"
    )

    examples = [
        [
            "Des chercheurs de l' [START] Université de Cambridge [END] ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
        ],
        [
            "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. ([START] Reuters [END])"
        ],
        [
            "In the [START] year 1789 [END], the Estates-General was convened in France."
        ],
        ["[START] King Louis XVI, ruler of France [END], called for the meeting."],
        [
            "The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy."
        ],
        [
            # Mention markers added: the model needs a marked entity.
            "At Versailles, [START] Marie Antoinette, the Queen of France [END], was involved in discussions."
        ],
        [
            # Mention markers added: the model needs a marked entity.
            "[START] Maximilien Robespierre, a leading member of the National Assembly [END], also participated."
        ],
        [
            "[START] Jean-Jacques Rousseau, the famous philosopher [END], was a significant figure in the debate."
        ],
        [
            "Another important participant was [START] Charles de Talleyrand, the Bishop of Autun [END]."
        ],
        [
            "Meanwhile, across the Atlantic, [START] George Washington, the first President of the United States [END], was shaping policies."
        ],
        [
            "[START] Thomas Jefferson, the nation's Secretary of State [END], played a key role in drafting policies for the new American government."
        ],
    ]

    interface = gr.Interface(
        fn=disambiguate_sentence,
        inputs=input_sentence,
        outputs=output_entities,
        title="Entity Linking with impresso-project/nel-hipe-multilingual",
        description=description,
        examples=examples,
    )

    interface.launch()
|
|
|
|
|
# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    nel_app_interface()
|
|