File size: 6,166 Bytes
06e9286 6627fc9 06e9286 aeeec0d 06e9286 44fb8bb 1a30cc1 44fb8bb 1a30cc1 44fb8bb 1a30cc1 44fb8bb 6627fc9 94113a9 6627fc9 44fb8bb 1a30cc1 44fb8bb 1a30cc1 6627fc9 1a30cc1 ac76025 1a30cc1 06e9286 94113a9 ac76025 94113a9 1a30cc1 94113a9 ac76025 74ff55c ac76025 74ff55c ac76025 94113a9 ac76025 94113a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import requests
# Load the multilingual entity-linking (NEL) tokenizer and model once at
# import time so the Gradio handler can reuse them across requests.
tokenizer = AutoTokenizer.from_pretrained("impresso-project/nel-hipe-multilingual")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "impresso-project/nel-hipe-multilingual"
).eval()  # eval(): inference mode (disables dropout etc.)
print("Model loaded successfully!")
def get_wikipedia_page_props(input_str: str) -> str:
    """
    Retrieve the Wikidata QID for a Wikipedia page name.

    Args:
        input_str: Input in the format "page_name >> language",
            e.g. "Paris >> fr".

    Returns:
        The QID string (e.g. "Q90"), "NIL" when no QID could be resolved,
        or an error message string for malformed input.
    """
    try:
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        # Preserve the historical error string for malformed input.
        return "Invalid input format. Use 'page_name >> language'."

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    try:
        # Bound the wait so a slow/unreachable API cannot hang the UI.
        response = requests.get(wikipedia_url, params=wikipedia_params, timeout=10)
        response.raise_for_status()
        data = response.json()
        # The API keys pages by (possibly negative) page id; take the first.
        pages = data["query"]["pages"]
        page = next(iter(pages.values()))
        return page.get("pageprops", {}).get("wikibase_item", "NIL")
    except (requests.exceptions.RequestException, KeyError, ValueError, StopIteration):
        # Network failure or unexpected payload shape: treat as unresolved.
        return "NIL"
def get_wikipedia_title(qid: str, language: str = "en"):
    """
    Look up the Wikipedia title and URL for a Wikidata QID.

    Args:
        qid: Wikidata entity id, e.g. "Q90".
        language: Wikipedia language edition to query (default "en").

    Returns:
        A (title, url) tuple on success, or ("NIL", "None") when the
        entity has no sitelink for the requested language edition.
    """
    url = "https://www.wikidata.org/w/api.php"  # constant endpoint, no placeholders
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }
    # Bound the wait so a slow/unreachable API cannot hang the UI.
    response = requests.get(url, params=params, timeout=10)
    data = response.json()
    try:
        sitelink = data["entities"][qid]["sitelinks"][f"{language}wiki"]
        return sitelink["title"], sitelink["url"]
    except KeyError:
        return "NIL", "None"
def disambiguate_sentence(sentence):
    """
    Link the [START]...[END]-tagged entity in *sentence* via the NEL model.

    Generates candidate "page_name >> language" strings with the seq2seq
    model, resolves the top beam to a Wikidata QID, then to a Wikipedia
    title/URL.

    Args:
        sentence: Input text with exactly one entity wrapped in
            [START] ... [END] markers.

    Returns:
        An HTML snippet with the entity title, QID and a Wikipedia link,
        or the plain string "No entity found." when resolution fails.
    """
    outputs = model.generate(
        **tokenizer([sentence], return_tensors="pt"),
        num_beams=5,
        num_return_sequences=5,
        max_new_tokens=30,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"Decoded: {decoded}")

    # Top beam is expected to be "page_name >> language".
    wikipedia_name = decoded[0]
    qid = get_wikipedia_page_props(wikipedia_name)
    print(f"QID: {qid}")

    # Bail out *before* querying Wikidata: the original code issued a
    # wasted API call with the invalid id "NIL" and discarded the result.
    if qid == "NIL":
        return "No entity found."

    title, url = get_wikipedia_title(qid)

    # HTML output with a clickable link to the linked Wikipedia page.
    entity_info = f"""<div>
    <strong>Entity:</strong> {title} <br>
    <strong>QID:</strong> {qid} <br>
    <a href="{url}" target="_blank">Wikipedia Page</a>
    </div>
    """
    return entity_info
def nel_app_interface():
    """Build and launch the Gradio UI for single-entity linking."""
    # Sample inputs shown below the interface; each marks exactly one
    # entity with [START] ... [END].
    example_sentences = [
        [
            "Des chercheurs de l' [START] Université de Cambridge [END] ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
        ],
        [
            "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. ([START] Reuters [END])"
        ],
        [
            "In the [START] year 1789 [END], the Estates-General was convened in France."
        ],
        ["[START] King Louis XVI, ruler of France [END], called for the meeting."],
        [
            "The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy."
        ],
        [
            "At Versailles, Marie Antoinette, the Queen of France, was involved in discussions."
        ],
        [
            "Maximilien Robespierre, a leading member of the National Assembly, also participated."
        ],
        [
            "[START] Jean-Jacques Rousseau, the famous philosopher [END], was a significant figure in the debate."
        ],
        [
            "Another important participant was [START] Charles de Talleyrand, the Bishop of Autun [END]."
        ],
        [
            "Meanwhile, across the Atlantic, [START] George Washington, the first President of the United States [END], was shaping policies."
        ],
        [
            "[START] Thomas Jefferson, the nation's Secretary of State [END], played a key role in drafting policies for the new American government."
        ],
    ]

    usage_notes = (
        "Link entities using the `impresso-project/nel-hipe-multilingual` model under the hood! "
        "We recommend using shorter texts (ie sentences, not full paragraphs). <br>"
        "The sentences in the following format: <br>"
        "<< We are going to [START] Paris [END].>> "
        "This format ensures that the model knows which entities to disambiguate, more exactly the "
        "entity should be surrounded by `[START]` and `[END]`. <br>"
        "Warning: Only one entity per sentence is supported at the moment!"
    )

    # Input/output widgets.
    sentence_box = gr.Textbox(
        lines=5,
        label="Input Sentence",
        placeholder="Enter your sentence here:",
    )
    entity_html = gr.HTML(label="Linked Entity")

    demo = gr.Interface(
        fn=disambiguate_sentence,
        inputs=sentence_box,
        outputs=entity_html,
        title="Entity Linking with impresso-project/nel-hipe-multilingual",
        description=usage_notes,
        examples=example_sentences,
    )
    demo.launch()
if __name__ == "__main__":
    # Launch the Gradio demo only when executed as a script (not on import).
    nel_app_interface()
|