|
import html

import gradio as gr
import requests
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
|
|
# Hugging Face Hub id of the entity-linking model. It is the single source of
# truth for both the tokenizer and the pipeline below, so the two can never
# end up pointing at different checkpoints.
NEL_MODEL_NAME = "impresso-project/nel-mgenre-multilingual"

# Loaded once at import time and shared by every request handled by the app.
nel_tokenizer = AutoTokenizer.from_pretrained(NEL_MODEL_NAME)

# NOTE(review): "generic-nel" is presumably a custom task shipped with the
# model repository, which would be why trust_remote_code is enabled - confirm.
nel_pipeline = pipeline(
    "generic-nel",
    model=NEL_MODEL_NAME,
    tokenizer=nel_tokenizer,
    trust_remote_code=True,
    device="cpu",
)

print("Model loaded successfully!")
|
|
|
|
|
def get_wikipedia_page_props(input_str: str, timeout: float = 10.0):
    """
    Look up the Wikidata QID of a Wikipedia page.

    The page is queried through the MediaWiki API (``prop=pageprops``) of the
    Wikipedia edition named in ``input_str``. The lookup is best-effort: any
    network, HTTP, or response-shape problem yields "NIL" instead of raising.

    Args:
        input_str (str): The input string in the format "page_name >> language".
        timeout (float): Seconds to wait for the Wikipedia API before giving up.

    Returns:
        str: The QID stored in the page's ``wikibase_item`` property; "NIL" if
        it cannot be determined; or an error message if ``input_str`` does not
        have the "page_name >> language" format.
    """
    try:
        # Exactly one " >> " separator is required; anything else fails to unpack.
        page_name, language = input_str.split(" >> ")
    except ValueError:
        return "Invalid input format. Use 'page_name >> language'."

    page_name = page_name.strip()
    language = language.strip()
    if not page_name or not language:
        # Nothing meaningful to query; skip the network round-trip.
        return "NIL"

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    try:
        response = requests.get(
            wikipedia_url, params=wikipedia_params, timeout=timeout
        )
        response.raise_for_status()
        pages = response.json()["query"]["pages"]
        # A single title was requested, so only the first entry is inspected.
        first_page = next(iter(pages.values()))
        return first_page["pageprops"]["wikibase_item"]
    except (
        requests.RequestException,  # connection errors, timeouts, bad status
        ValueError,  # body is not valid JSON
        LookupError,  # expected key missing from the response
        TypeError,  # response has an unexpected shape
        AttributeError,
        StopIteration,  # "pages" is empty
    ):
        return "NIL"
|
|
|
|
|
def get_wikipedia_title(qid, language="en", timeout=10.0):
    """
    Resolve a Wikidata QID to the title and URL of its Wikipedia page.

    The sitelink is fetched from the Wikidata ``wbgetentities`` API, filtered
    to the ``<language>wiki`` site. Like ``get_wikipedia_page_props``, the
    lookup is best-effort: network, HTTP, and response-shape problems produce
    the "not found" result instead of raising.

    Args:
        qid (str): Wikidata identifier to resolve. An empty value or "NIL"
            returns the "not found" result without any request being made.
        language (str): Wikipedia language code used to build the site name.
        timeout (float): Seconds to wait for the Wikidata API before giving up.

    Returns:
        tuple[str, str]: ``(title, url)`` of the page, or ``("NIL", "None")``
        when no sitelink could be retrieved.
    """
    not_found = ("NIL", "None")
    if not qid or qid == "NIL":
        # "NIL" is the sentinel this file uses for an unresolved entity.
        return not_found

    site = f"{language}wiki"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": site,
    }

    try:
        response = requests.get(
            "https://www.wikidata.org/w/api.php", params=params, timeout=timeout
        )
        response.raise_for_status()
        sitelink = response.json()["entities"][qid]["sitelinks"][site]
        return sitelink["title"], sitelink["url"]
    except (
        requests.RequestException,  # connection errors, timeouts, bad status
        ValueError,  # body is not valid JSON
        LookupError,  # entity or sitelink missing from the response
        TypeError,  # defensive: a container is not the expected mapping
    ):
        return not_found
|
|
|
|
|
def disambiguate_sentence(sentence):
    """
    Link the entity in ``sentence`` and render the result as an HTML snippet.

    Args:
        sentence (str): Text handed unchanged to ``nel_pipeline``.

    Returns:
        str: A ``<div>`` showing the entity surface, Wikidata QID, Wikipedia
        title and a link to the Wikipedia page of the first result, or a short
        notice when the pipeline returns no result.
    """
    results = nel_pipeline(sentence)
    if not results:
        # Avoid an IndexError on an empty result; tell the user instead.
        return "<div><strong>Entity:</strong> no entity could be linked.</div>"

    # Only the first result is displayed.
    linked_entity = results[0]

    def field(key):
        # The values may echo user-supplied text, so escape them before they
        # are embedded in markup (quotes included, for the href attribute).
        return html.escape(str(linked_entity[key]), quote=True)

    entity_info = f"""<div>
    <strong>Entity:</strong> {field('surface')} <br>
    <strong>Wikidata QID:</strong> {field('wkd_id')} <br>
    <strong>Wikipedia Title:</strong> {field('wkpedia_pagename')} <br>
    <a href="{field('url')}" target="_blank">Wikipedia Page</a>
    </div>
    """
    return entity_info
|
|
|
|
|
def nel_app_interface():
    """
    Build the Gradio demo for entity linking and launch it.

    The interface feeds a multi-line textbox into ``disambiguate_sentence``
    and shows the returned HTML. The title and description are derived from
    ``NEL_MODEL_NAME`` so the UI always names the model that is actually loaded.
    """
    input_sentence = gr.Textbox(
        lines=5,
        label="Input Sentence",
        placeholder="Enter your sentence here:",
    )
    output_entities = gr.HTML(label="Linked Entities:")

    description = (
        f"Link entities using the `{NEL_MODEL_NAME}` model under the hood! "
        "We recommend using shorter texts (ie sentences, not full paragraphs). <br>"
        "The sentences in the following format: <br>"
        "<i><< We are going to `[START]` Paris `[END]` >></i> <br>"
        "This format ensures that the model knows which entities to disambiguate, more exactly the "
        "entity should be surrounded by `[START]` and `[END]`. <br> <br>"
        "<b>Warning</b>: Only one entity per sentence is supported at the moment!"
    )

    # Each example is a one-element list because the interface has one input.
    examples = [
        [
            "Des chercheurs de l' [START] Université de Cambridge [END] ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
        ],
        [
            "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. ([START] Reuters [END])"
        ],
        [
            "In the [START] year 1789 [END], the Estates-General was convened in France."
        ],
        [
            "The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy."
        ],
        [
            "At Versailles, [START] Antoinette [END], the Queen of France, was involved in discussions."
        ],
        [
            "[START] Maximilien Robespierre [END], a leading member of the National Assembly, also participated."
        ],
        [
            "[START] Jean-Jacques Rousseau, the famous philosopher [END], was a significant figure in the debate."
        ],
        [
            "Another important participant was [START] Charles de Talleyrand, the Bishop of Autun [END]."
        ],
        [
            "Meanwhile, across the Atlantic, [START] George Washington, the first President of the United States [END], was shaping policies."
        ],
        [
            "[START] Thomas Jefferson, the nation's Secretary of State [END], played a key role in drafting policies for the new American government."
        ],
    ]

    interface = gr.Interface(
        fn=disambiguate_sentence,
        inputs=input_sentence,
        outputs=output_entities,
        title=f"Entity Linking with {NEL_MODEL_NAME}",
        description=description,
        examples=examples,
    )

    interface.launch()
|
|
|
|
|
# Start the demo only when this file is executed as a script, so importing
# the module does not launch the web interface.
if __name__ == "__main__":
    nel_app_interface()
|
|