File size: 6,147 Bytes
06e9286
 
6627fc9
06e9286
 
 
 
 
 
aeeec0d
 
06e9286
44fb8bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6627fc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94113a9
da7878b
6627fc9
 
 
 
 
 
 
 
44fb8bb
 
 
 
 
 
6627fc9
44fb8bb
6627fc9
 
 
 
da7878b
06e9286
 
94113a9
 
 
 
 
 
 
 
 
6627fc9
94113a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import requests

# Load the entity-linking tokenizer/model once at import time.
# .eval() switches the model to inference mode (disables dropout etc.).
tokenizer = AutoTokenizer.from_pretrained("impresso-project/nel-hipe-multilingual")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "impresso-project/nel-hipe-multilingual"
).eval()

print("Model loaded successfully!")


def get_wikipedia_page_props(input_str: str):
    """
    Retrieve the Wikidata QID for a Wikipedia page name on a given language wiki.

    Args:
        input_str (str): Input in the format "page_name >> language"
            (e.g. "Paris >> fr").

    Returns:
        str: The QID (e.g. "Q90"), "NIL" when no QID could be resolved,
            or an error message when the input format is invalid.
    """
    try:
        # Split "page_name >> language" into its two components; anything
        # that does not yield exactly two parts is a format error.
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        return "Invalid input format. Use 'page_name >> language'."

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    qid = "NIL"
    try:
        # Query the MediaWiki API; timeout keeps a slow wiki from hanging the UI.
        response = requests.get(wikipedia_url, params=wikipedia_params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Navigate query -> pages -> <page_id> -> pageprops -> wikibase_item.
        pages = data.get("query", {}).get("pages", {})
        if pages:
            first_page = next(iter(pages.values()))
            return first_page.get("pageprops", {}).get("wikibase_item", qid)
    except Exception:
        # Best-effort lookup: any network/JSON failure resolves to "NIL".
        pass

    # Bug fix: the original fell through and implicitly returned None when
    # "pages" was missing from the response; always return "NIL" instead.
    return qid


def get_wikipedia_title(qid, language="en"):
    """
    Resolve a Wikidata QID to its Wikipedia page title and URL.

    Args:
        qid (str): Wikidata entity id, e.g. "Q90".
        language (str): Wikipedia language edition to look up (default "en").

    Returns:
        tuple[str, str]: (title, url) of the sitelink, or ("NIL", "None")
            when the entity or the requested-language sitelink is missing.
    """
    # Fixed: this was an f-string with no placeholders.
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }

    # timeout keeps a slow Wikidata response from blocking the caller forever
    response = requests.get(url, params=params, timeout=10)
    data = response.json()

    try:
        sitelink = data["entities"][qid]["sitelinks"][f"{language}wiki"]
        return sitelink["title"], sitelink["url"]
    except KeyError:
        # Entity unknown, malformed response, or no sitelink for this language.
        return "NIL", "None"


def disambiguate_sentence(sentence):
    """
    Link the [START]...[END]-marked entity in *sentence* to Wikidata/Wikipedia.

    Returns:
        dict: {"text": original sentence, "entities": one-element list with a
        "QID: ..., Title: ..., URL: ..." description of the linked entity}.
    """
    # Beam-search the seq2seq model for candidate "page_name >> language"
    # predictions; 5 beams / 5 sequences, capped at 30 new tokens.
    encoded = tokenizer([sentence], return_tensors="pt")
    outputs = model.generate(
        **encoded,
        num_beams=5,
        num_return_sequences=5,
        max_new_tokens=30,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"Decoded: {decoded}")

    # Take the top beam as the predicted page reference and resolve it.
    best_prediction = decoded[0]
    qid = get_wikipedia_page_props(best_prediction)
    print(f"qid: {qid}")

    title, url = get_wikipedia_title(qid)

    entities = [f"QID: {qid}, Title: {title}, URL: {url}"]
    print(f"Entities: {entities}")
    return {"text": sentence, "entities": entities}


def nel_app_interface():
    """
    Build and launch the Gradio UI for single-entity linking.

    The input textbox expects one sentence with exactly one entity wrapped in
    [START] ... [END]; the output textbox shows the linked QID/title/URL.
    """
    input_sentence = gr.Textbox(
        lines=5,
        label="Input Sentence",
        placeholder="Enter your sentence here in the following format: //  << We are going to [START] Paris [END]. >>"
        " // This format ensures that the model knows which entities to disambiguate, more exactly the "
        "entity should be surrounded by `[START]` and `[END]`. // "
        "!Only one entity per sentence is supported at the moment!",
    )
    output_entities = gr.Textbox(label="Linked Entities")

    # Interface definition. Bug fix: gr.Interface expects one inner list per
    # example, with one value per input component (here a single textbox).
    # The original nested all sentences inside ONE example; each sentence is
    # now its own clickable example.
    interface = gr.Interface(
        fn=disambiguate_sentence,
        inputs=input_sentence,
        outputs=output_entities,
        title="Entity Linking with impresso-project/nel-hipe-multilingual",
        description="Link entities using the `impresso-project/nel-hipe-multilingual` model under the hood!",
        examples=[
            [
                "Des chercheurs de l' [START] Université de Cambridge [END] ont développé une nouvelle technique de calcul "
                "quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
            ],
            [
                "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. ([START] "
                "Reuters [END])"
            ],
            ["In the [START] year 1789 [END], the Estates-General was convened in France."],
            ["[START] King Louis XVI, ruler of France [END], called for the meeting."],
            ["The event was held at the [START] Palace of Versailles [END], a symbol of French monarchy."],
            ["At Versailles, Marie Antoinette, the Queen of France, was involved in discussions."],
            ["Maximilien Robespierre, a leading member of the National Assembly, also participated."],
            ["[START] Jean-Jacques Rousseau, the famous philosopher [END], was a significant figure in the debate."],
            ["Another important participant was [START] Charles de Talleyrand, the Bishop of Autun [END]."],
            [
                "Meanwhile, across the Atlantic, [START] George Washington, the first President of the United States [END], "
                "was shaping policies."
            ],
            [
                "[START] Thomas Jefferson, the nation's Secretary of State [END], played a key role in drafting policies for "
                "the new American government."
            ],
        ],
    )

    interface.launch()


# Script entry point: build the Gradio interface and start serving.
if __name__ == "__main__":
    nel_app_interface()