emanuelaboros committed on
Commit
44fb8bb
Β·
1 Parent(s): 6627fc9

update app

Browse files
Files changed (1) hide show
  1. app.py +64 -4
app.py CHANGED
@@ -10,6 +10,63 @@ model = AutoModelForSeq2SeqLM.from_pretrained(
10
  print("Model loaded successfully!")
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def get_wikipedia_title(qid, language="en"):
14
  url = f"https://www.wikidata.org/w/api.php"
15
  params = {
@@ -41,11 +98,14 @@ def disambiguate_sentence(sentence):
41
  max_new_tokens=30,
42
  )
43
  decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
44
- qid = decoded[0].split()[-1] # Assuming QID is the last token in the output
45
-
46
- # Get Wikipedia title and URL
 
 
 
47
  title, url = get_wikipedia_title(qid)
48
-
49
  entity_info = f"QID: {qid}, Title: {title}, URL: {url}"
50
  entities.append(entity_info)
51
 
 
10
  print("Model loaded successfully!")
11
 
12
 
13
def get_wikipedia_page_props(input_str: str):
    """
    Retrieve the Wikidata QID for a given Wikipedia page.

    Args:
        input_str (str): Input in the format "page_name >> language",
            e.g. "Barack Obama >> en".

    Returns:
        str: The QID (e.g. "Q76"), or "NIL" when the page has no
        associated Wikidata item, the API request fails, or the
        input string is malformed.
    """
    try:
        # Preprocess the input string: exactly one "page >> lang" pair.
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        # Malformed input: honor the documented "QID or NIL" contract
        # instead of leaking an error message where callers expect a QID.
        return "NIL"

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    try:
        # timeout keeps a stalled Wikipedia API call from hanging the app.
        response = requests.get(wikipedia_url, params=wikipedia_params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # The API keys pages by page id; walk them and take the first
        # entry that carries a "wikibase_item" page property.
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            qid = page.get("pageprops", {}).get("wikibase_item")
            if qid:
                return qid
    except Exception:
        # Network / HTTP / JSON errors all fall through to the NIL
        # fallback below (a fallback_to_openrefine hook could go here).
        pass

    return "NIL"
70
  def get_wikipedia_title(qid, language="en"):
71
  url = f"https://www.wikidata.org/w/api.php"
72
  params = {
 
98
  max_new_tokens=30,
99
  )
100
  decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
101
+ print(f"Decoded: {decoded}")
102
+ wikipedia_name = decoded[0]  # model output is the Wikipedia page name; the QID is resolved below
103
+ qid = get_wikipedia_page_props(wikipedia_name)
104
+ print(f"qid: {qid}")
105
+ #
106
+ # # Get Wikipedia title and URL
107
  title, url = get_wikipedia_title(qid)
108
+ #
109
  entity_info = f"QID: {qid}, Title: {title}, URL: {url}"
110
  entities.append(entity_info)
111