Commit 44fb8bb
Parent(s): 6627fc9
update app
app.py CHANGED
@@ -10,6 +10,63 @@ model = AutoModelForSeq2SeqLM.from_pretrained(
 print("Model loaded successfully!")
 
 
+def get_wikipedia_page_props(input_str: str):
+    """
+    Retrieves the QID for a given Wikipedia page name from the specified language Wikipedia.
+    If the request fails, it falls back to using the OpenRefine Wikidata API.
+
+    Args:
+        input_str (str): The input string in the format "page_name >> language".
+
+    Returns:
+        str: The QID or "NIL" if the QID is not found.
+    """
+    # # Check if the result is already in the cache
+    # if input_str in cache:
+    #     return cache[input_str]
+
+    try:
+        # Preprocess the input string
+        page_name, language = input_str.split(" >> ")
+        page_name = page_name.strip()
+        language = language.strip()
+    except ValueError:
+        return "Invalid input format. Use 'page_name >> language'."
+
+    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
+    wikipedia_params = {
+        "action": "query",
+        "prop": "pageprops",
+        "format": "json",
+        "titles": page_name,
+    }
+
+    qid = "NIL"
+    try:
+        # Attempt to fetch from Wikipedia API
+        response = requests.get(wikipedia_url, params=wikipedia_params)
+        response.raise_for_status()
+        data = response.json()
+
+        if "pages" in data["query"]:
+            page_id = list(data["query"]["pages"].keys())[0]
+
+            if "pageprops" in data["query"]["pages"][page_id]:
+                page_props = data["query"]["pages"][page_id]["pageprops"]
+
+                if "wikibase_item" in page_props:
+                    return page_props["wikibase_item"]
+                else:
+                    # If no page properties found, fall back to OpenRefine Wikidata API
+                    return qid  # fallback_to_openrefine(page_name, language)
+            else:
+                return qid  # fallback_to_openrefine(page_name, language)
+
+    except Exception as e:
+        # save_cache(cache, cache_file)
+        return qid  # fallback_to_openrefine(page_name, language)
+
+
 def get_wikipedia_title(qid, language="en"):
     url = f"https://www.wikidata.org/w/api.php"
     params = {
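The new get_wikipedia_page_props helper resolves a model prediction of the form "page_name >> language" to a Wikidata QID through the MediaWiki Action API: querying prop=pageprops returns a wikibase_item entry that carries the QID. A minimal standalone sketch of the same lookup, mirroring the request parameters the commit uses (the helper name and the example title are illustrative, not part of the commit):

import requests

def lookup_qid(page_name: str, language: str = "en") -> str:
    """Return the Wikidata QID for a Wikipedia page title, or "NIL" if absent."""
    response = requests.get(
        f"https://{language}.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "prop": "pageprops",
            "format": "json",
            "titles": page_name,
        },
        timeout=10,
    )
    response.raise_for_status()
    pages = response.json()["query"]["pages"]
    page = next(iter(pages.values()))  # one title queried -> one page entry
    return page.get("pageprops", {}).get("wikibase_item", "NIL")

print(lookup_qid("Barack Obama"))  # expected: Q76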
@@ -41,11 +98,14 @@ def disambiguate_sentence(sentence):
         max_new_tokens=30,
     )
     decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-
-
+    print(f"Decoded: {decoded}")
+    wikipedia_name = decoded[0]  # Assuming QID is the last token in the output
+    qid = get_wikipedia_page_props(wikipedia_name)
+    print(f"qid: {qid}")
+    #
+    # # Get Wikipedia title and URL
     title, url = get_wikipedia_title(qid)
-
+    #
     entity_info = f"QID: {qid}, Title: {title}, URL: {url}"
     entities.append(entity_info)
 
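In disambiguate_sentence, the decoded model output is now routed through get_wikipedia_page_props before the existing get_wikipedia_title lookup, so the entity string is built from a resolved QID rather than the raw prediction. The diff shows only the first lines of get_wikipedia_title; a plausible sketch consistent with its signature, using Wikidata's wbgetentities sitelinks (an assumption, not the commit's actual body):

import requests

def get_wikipedia_title_sketch(qid: str, language: str = "en"):
    """Hypothetical: map a QID to its Wikipedia title and URL via sitelinks."""
    if qid == "NIL":
        return "NIL", "NIL"
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "ids": qid,
            "props": "sitelinks",
            "format": "json",
        },
        timeout=10,
    )
    response.raise_for_status()
    sitelinks = response.json()["entities"][qid].get("sitelinks", {})
    link = sitelinks.get(f"{language}wiki")  # e.g. "enwiki" for English
    if link is None:
        return "NIL", "NIL"
    title = link["title"]
    return title, f"https://{language}.wikipedia.org/wiki/{title.replace(' ', '_')}"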
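Note that every failure path in get_wikipedia_page_props currently returns qid ("NIL") with the fallback_to_openrefine call left commented out, so the OpenRefine fallback promised in the docstring is not active in this commit. If it were enabled, one way to implement it is against the Wikidata reconciliation service that OpenRefine uses; the endpoint URL and response handling below are assumptions:

import json
import requests

def fallback_to_openrefine(page_name: str, language: str = "en") -> str:
    """Hypothetical fallback: reconcile a page name to a QID via the
    Wikidata reconciliation service."""
    try:
        response = requests.get(
            f"https://wikidata.reconci.link/{language}/api",  # assumed endpoint
            params={"queries": json.dumps({"q0": {"query": page_name}})},
            timeout=10,
        )
        response.raise_for_status()
        results = response.json()["q0"]["result"]
        return results[0]["id"] if results else "NIL"  # candidates are ranked by score
    except Exception:
        return "NIL"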
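The commit also leaves a result cache commented out: cache, save_cache, and cache_file are referenced but never defined in the visible diff. A minimal JSON-file cache along those lines, with all names hypothetical:

import json
import os

CACHE_FILE = "qid_cache.json"  # hypothetical path; the diff never defines it

def load_cache(cache_file: str = CACHE_FILE) -> dict:
    """Load the page-name -> QID cache, or start empty if none exists."""
    if os.path.exists(cache_file):
        with open(cache_file, encoding="utf-8") as f:
            return json.load(f)
    return {}

def save_cache(cache: dict, cache_file: str = CACHE_FILE) -> None:
    """Persist the cache so repeated lookups skip the network round trip."""
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False, indent=2)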