Update fix langlinks.py
Browse files- fix langlinks.py +42 -12
fix langlinks.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
import re
|
2 |
import tqdm
|
|
|
3 |
import json
|
4 |
import requests
|
5 |
from huggingface_hub import login
|
@@ -14,7 +14,13 @@ response = requests.get(json_url)
|
|
14 |
data = response.json()
|
15 |
|
16 |
# تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
for key, value in tqdm.tqdm(data.items()):
|
19 |
# "Category:1. FC Köln non-playing staff"
|
20 |
# remove " from start and end
|
@@ -27,16 +33,40 @@ for key, value in tqdm.tqdm(data.items()):
|
|
27 |
if value.startswith(':"') and value.endswith('",'):
|
28 |
value = value[2:-2]
|
29 |
# ----
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
35 |
|
|
|
|
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
path_in_repo="langlinks_fixed.json", # المسار داخل المستودع
|
40 |
-
repo_id="Ibrahemqasim/enwiki_to_arwiki_categories", # معرف المستودع
|
41 |
-
# repo_type="dataset", # نوع المستودع (نستخدم dataset للملفات)
|
42 |
-
)
|
|
|
|
|
1 |
import tqdm
|
2 |
+
import re
|
3 |
import json
|
4 |
import requests
|
5 |
from huggingface_hub import login
|
|
|
14 |
data = response.json()
|
15 |
|
16 |
# تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
|
17 |
+
|
18 |
+
data_lists = {
|
19 |
+
"langlinks" : {},
|
20 |
+
"filtered_data" : {},
|
21 |
+
"cats_2000" : {},
|
22 |
+
}
|
23 |
+
|
24 |
for key, value in tqdm.tqdm(data.items()):
|
25 |
# "Category:1. FC Köln non-playing staff"
|
26 |
# remove " from start and end
|
|
|
33 |
if value.startswith(':"') and value.endswith('",'):
|
34 |
value = value[2:-2]
|
35 |
# ----
|
36 |
+
# data_lists["langlinks"].append({"en": key, "ar": value})
|
37 |
+
data_lists["langlinks"][key] = value
|
38 |
+
# ----
|
39 |
+
# Add the pair only if key and value each contain a 4-digit number and the two numbers are identical
|
40 |
+
key_digits = re.search(r"\d{4}", key)
|
41 |
+
value_digits = re.search(r"\d{4}", value)
|
42 |
+
# ----
|
43 |
+
if key_digits and value_digits and key_digits.group() == value_digits.group():
|
44 |
+
# data_lists["filtered_data"].append({"en": key, "ar": value})
|
45 |
+
data_lists["filtered_data"][key] = value
|
46 |
+
# ---
|
47 |
+
key2 = key.replace(key_digits.group(), "2000")
|
48 |
+
value2 = value.replace(value_digits.group(), "2000")
|
49 |
+
# ---
|
50 |
+
# data_lists["cats_2000"].append({"en": key2, "ar": value2})
|
51 |
+
# Store the normalised pair (the matched 4-digit number replaced by "2000"),
# not the original pair; otherwise this mapping just duplicates "filtered_data"
# and key2/value2 are computed for nothing.
data_lists["cats_2000"][key2] = value2
|
52 |
+
|
53 |
+
|
54 |
+
for x, data_list in data_lists.items():
|
55 |
+
data_list = [{"en": key, "ar": value} for key, value in data_list.items()]
|
56 |
+
|
57 |
+
# حفظ القاموس المصحح في ملف JSON
|
58 |
+
with open(f"{x}.json", "w", encoding="utf-8") as f:
|
59 |
+
json.dump(data_list, f, ensure_ascii=False, indent=4)
|
60 |
|
61 |
+
upload_file(
|
62 |
+
path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
|
63 |
+
path_in_repo=f"{x}.json", # المسار داخل المستودع
|
64 |
+
repo_id="Ibrahemqasim/enwiki_to_arwiki_categories", # معرف المستودع
|
65 |
+
# repo_type="dataset", # نوع المستودع (نستخدم dataset للملفات)
|
66 |
+
)
|
67 |
|
68 |
+
print(f"__________________")
|
69 |
+
print(f"file: {x} uploaded successfully!")
|
70 |
|
71 |
+
print(f"{len(data)=}.")
|
72 |
+
print(f"{len(data_list)} rows uploaded.")
|
|
|
|
|
|
|
|