emanuelaboros committed on
Commit
44fb8bb
Β·
1 Parent(s): 6627fc9

update app

Browse files
Files changed (1) hide show
  1. app.py +64 -4
app.py CHANGED
@@ -10,6 +10,63 @@ model = AutoModelForSeq2SeqLM.from_pretrained(
10
  print("Model loaded successfully!")
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def get_wikipedia_title(qid, language="en"):
14
  url = f"https://www.wikidata.org/w/api.php"
15
  params = {
@@ -41,11 +98,14 @@ def disambiguate_sentence(sentence):
41
  max_new_tokens=30,
42
  )
43
  decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
44
- qid = decoded[0].split()[-1] # Assuming QID is the last token in the output
45
-
46
- # Get Wikipedia title and URL
 
 
 
47
  title, url = get_wikipedia_title(qid)
48
-
49
  entity_info = f"QID: {qid}, Title: {title}, URL: {url}"
50
  entities.append(entity_info)
51
 
 
10
  print("Model loaded successfully!")
11
 
12
 
13
def get_wikipedia_page_props(input_str: str):
    """
    Retrieve the Wikidata QID for a given Wikipedia page.

    Args:
        input_str (str): Input in the format "page_name >> language",
            e.g. "Barack Obama >> en".

    Returns:
        str: The QID (e.g. "Q76"), or "NIL" when the page has no
        associated Wikidata item, the API request fails, or the
        input string is malformed.
    """
    try:
        # Preprocess the input string: exactly one "page >> lang" pair.
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        # Malformed input: honor the documented "QID or NIL" contract
        # instead of leaking an error message where callers expect a QID.
        return "NIL"

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    try:
        # timeout keeps a stalled Wikipedia API call from hanging the app.
        response = requests.get(wikipedia_url, params=wikipedia_params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # The API keys pages by page id; walk them and take the first
        # entry that carries a "wikibase_item" page property.
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            qid = page.get("pageprops", {}).get("wikibase_item")
            if qid:
                return qid
    except Exception:
        # Network / HTTP / JSON errors all fall through to the NIL
        # fallback below (a fallback_to_openrefine hook could go here).
        pass

    return "NIL"
70
  def get_wikipedia_title(qid, language="en"):
71
  url = f"https://www.wikidata.org/w/api.php"
72
  params = {
 
98
  max_new_tokens=30,
99
  )
100
  decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
101
+ print(f"Decoded: {decoded}")
102
+ wikipedia_name = decoded[0]  # model output is the Wikipedia page name; the QID is resolved below
103
+ qid = get_wikipedia_page_props(wikipedia_name)
104
+ print(f"qid: {qid}")
105
+ #
106
+ # # Get Wikipedia title and URL
107
  title, url = get_wikipedia_title(qid)
108
+ #
109
  entity_info = f"QID: {qid}, Title: {title}, URL: {url}"
110
  entities.append(entity_info)
111