import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import requests
tokenizer = AutoTokenizer.from_pretrained("impresso-project/nel-hipe-multilingual")
model = AutoModelForSeq2SeqLM.from_pretrained(
"impresso-project/nel-hipe-multilingual"
).eval()
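# .eval() switches the model to inference mode (e.g. disables dropout); the demo
# only runs generation, so no training-time behavior is needed.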
print("Model loaded successfully!")
def get_wikipedia_page_props(input_str: str):
"""
Retrieves the QID for a given Wikipedia page name from the specified language Wikipedia.
If the request fails, it falls back to using the OpenRefine Wikidata API.
Args:
input_str (str): The input string in the format "page_name >> language".
Returns:
str: The QID or "NIL" if the QID is not found.
"""
    # (A result cache could be consulted here; caching is disabled in this demo.)
try:
# Preprocess the input string
page_name, language = input_str.split(" >> ")
page_name = page_name.strip()
language = language.strip()
    except ValueError:
        # Malformed prediction: treat the entity as unlinkable rather than
        # returning an error string that callers would mistake for a QID
        print(f"Invalid format: {input_str!r}. Expected 'page_name >> language'.")
        return "NIL"
wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "redirects": 1,  # follow redirects so non-canonical titles still resolve
        "titles": page_name,
    }
qid = "NIL"
try:
# Attempt to fetch from Wikipedia API
response = requests.get(wikipedia_url, params=wikipedia_params)
response.raise_for_status()
data = response.json()
if "pages" in data["query"]:
page_id = list(data["query"]["pages"].keys())[0]
if "pageprops" in data["query"]["pages"][page_id]:
page_props = data["query"]["pages"][page_id]["pageprops"]
if "wikibase_item" in page_props:
return page_props["wikibase_item"]
else:
# If no page properties found, fall back to OpenRefine Wikidata API
return qid # fallback_to_openrefine(page_name, language)
else:
return qid # fallback_to_openrefine(page_name, language)
except Exception as e:
# save_cache(cache, cache_file)
return qid # fallback_to_openrefine(page_name, language)
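# Illustrative usage (depends on the live Wikipedia API): a call such as
#   get_wikipedia_page_props("Paris >> fr")
# would typically resolve to "Q90", the Wikidata item for Paris.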
def get_wikipedia_title(qid, language="en"):
    """Resolve a Wikidata QID to the page title and URL on the given language's Wikipedia."""
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }
    response = requests.get(url, params=params, timeout=10)
    data = response.json()
try:
title = data["entities"][qid]["sitelinks"][f"{language}wiki"]["title"]
url = data["entities"][qid]["sitelinks"][f"{language}wiki"]["url"]
return title, url
    except KeyError:
        # The QID is invalid or has no sitelink for this language
        return "NIL", "None"
def disambiguate_sentence(sentence):
entities = []
# Generate model outputs for the sentence
outputs = model.generate(
**tokenizer([sentence], return_tensors="pt"),
num_beams=5,
num_return_sequences=5,
max_new_tokens=30,
)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"Decoded: {decoded}")
    # The top beam is the model's best prediction, formatted as "page name >> language";
    # the remaining beams are alternative candidates and are currently unused
    wikipedia_name = decoded[0]
    qid = get_wikipedia_page_props(wikipedia_name)
    print(f"qid: {qid}")
    # Resolve the QID to a Wikipedia title and URL
    title, url = get_wikipedia_title(qid)
entity_info = f"QID: {qid}, Title: {title}, URL: {url}"
entities.append(entity_info)
print(f"Entities: {entities}")
return {"text": sentence, "entities": entities}
def nel_app_interface():
input_sentence = gr.Textbox(
lines=5,
label="Input Sentence",
placeholder="Enter your sentence here in the following format: // << We are going to [START] Paris [END]. >>"
" // This format ensures that the model knows which entities to disambiguate, more exactly the "
"entity should be surrounded by `[START]` and `[END]`. // "
"!Only one entity per sentence is supported at the moment!",
)
output_entities = gr.Textbox(label="Linked Entities")
# Interface definition
interface = gr.Interface(
fn=disambiguate_sentence,
inputs=input_sentence,
outputs=output_entities,
title="Entity Linking with impresso-project/nel-hipe-multilingual",
description="Link entities using the `impresso-project/nel-hipe-multilingual` model under the hood!",
        # Each example is a one-element list because the interface has a single input
        examples=[
            [
                "Des chercheurs de l' [START] Université de Cambridge [END] ont développé une nouvelle technique "
                "de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
            ],
            [
                "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. "
                "([START] Reuters [END])"
            ],
            ["In the [START] year 1789 [END], the Estates-General was convened in France."],
            ["[START] King Louis XVI, ruler of France [END], called for the meeting."],
            ["The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy."],
            ["At Versailles, [START] Marie Antoinette, the Queen of France [END], was involved in discussions."],
            ["[START] Maximilien Robespierre, a leading member of the National Assembly [END], also participated."],
            ["[START] Jean-Jacques Rousseau, the famous philosopher [END], was a significant figure in the debate."],
            ["Another important participant was [START] Charles de Talleyrand, the Bishop of Autun [END]."],
            [
                "Meanwhile, across the Atlantic, [START] George Washington, the first President of the United "
                "States [END], was shaping policies."
            ],
            [
                "[START] Thomas Jefferson, the nation's Secretary of State [END], played a key role in drafting "
                "policies for the new American government."
            ],
        ],
)
interface.launch()
if __name__ == "__main__":
nel_app_interface()