Update fix langlinks.py
Browse files- fix langlinks.py +34 -5
fix langlinks.py
CHANGED
@@ -6,8 +6,10 @@ from huggingface_hub import login
|
|
6 |
from huggingface_hub import upload_file
|
7 |
from datasets import Dataset
|
8 |
|
|
|
|
|
9 |
# تسجيل الدخول إلى Hugging Face (استبدل "YOUR_ACCESS_TOKEN" بالتوكن الخاص بك)
|
10 |
-
login(
|
11 |
|
12 |
# تحميل الملف JSON من الرابط مباشرة
|
13 |
json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
|
@@ -24,8 +26,8 @@ countries = response2.json()
|
|
24 |
to_work = [
|
25 |
# "langlinks",
|
26 |
# "filtered_data",
|
27 |
-
|
28 |
-
"cats_2000",
|
29 |
]
|
30 |
|
31 |
data_lists = {
|
@@ -35,6 +37,14 @@ data_lists = {
|
|
35 |
"cats_2000" : {},
|
36 |
}
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
for tab in tqdm.tqdm(data):
|
39 |
# ---
|
40 |
key = tab["en"]
|
@@ -68,12 +78,30 @@ for tab in tqdm.tqdm(data):
|
|
68 |
# data_lists["cats_2000"].append({"en": key2, "ar": value2})
|
69 |
data_lists["cats_2000"][key2] = value2
|
70 |
# ----
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
for en_c, ar_c in countries.items():
|
72 |
if en_c in key2 and ar_c in value2:
|
73 |
key3 = key2.replace(en_c, "country")
|
74 |
value3 = value2.replace(ar_c, "country")
|
75 |
# ---
|
76 |
data_lists["cats_2000_contry"][key3] = value3
|
|
|
|
|
77 |
|
78 |
|
79 |
print(f"all data len: {len(data):,}.")
|
@@ -85,6 +113,7 @@ datasets_list = {
|
|
85 |
"cats_2000" : "categories_en2ar-cats_2000",
|
86 |
}
|
87 |
|
|
|
88 |
# for x, data_list in data_lists.items():
|
89 |
for x in to_work:
|
90 |
data_list = data_lists.get(x)
|
@@ -98,7 +127,7 @@ for x in to_work:
|
|
98 |
print("______________")
|
99 |
print(f"file: {x} uploaded successfully!")
|
100 |
print(f"len of {x} : {len(data_list)}.")
|
101 |
-
|
102 |
upload_file(
|
103 |
path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
|
104 |
path_in_repo=f"{x}.json", # المسار داخل المستودع
|
@@ -115,4 +144,4 @@ for x in to_work:
|
|
115 |
dataset = Dataset.from_list(data_list)
|
116 |
|
117 |
# رفع Dataset إلى Hugging Face
|
118 |
-
dataset.push_to_hub(f"Ibrahemqasim/{set_name}")
|
|
|
6 |
from huggingface_hub import upload_file
|
7 |
from datasets import Dataset
|
8 |
|
9 |
+
from google.colab import userdata
|
10 |
+
|
11 |
# Log in to Hugging Face using the access token stored in the Colab secret named "HF_API"
|
12 |
+
login(userdata.get('HF_API'))
|
13 |
|
14 |
# تحميل الملف JSON من الرابط مباشرة
|
15 |
json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
|
|
|
26 |
to_work = [
|
27 |
# "langlinks",
|
28 |
# "filtered_data",
|
29 |
+
"cats_2000_contry",
|
30 |
+
# "cats_2000",
|
31 |
]
|
32 |
|
33 |
data_lists = {
|
|
|
37 |
"cats_2000" : {},
|
38 |
}
|
39 |
|
40 |
+
|
41 |
+
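# Sort country names by number of spaces (descending) so that, in the regex alternation built below, multi-word names are tried before shorter names that are a prefix of them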
|
42 |
+
sorted_keys = sorted(countries.keys(), key=lambda x: -x.count(' '))
|
43 |
+
|
44 |
+
# نبني تعبير regex
|
45 |
+
regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
|
46 |
+
|
47 |
+
|
48 |
for tab in tqdm.tqdm(data):
|
49 |
# ---
|
50 |
key = tab["en"]
|
|
|
78 |
# data_lists["cats_2000"].append({"en": key2, "ar": value2})
|
79 |
data_lists["cats_2000"][key2] = value2
|
80 |
# ----
|
81 |
+
# البحث عن اسم الدولة في key2
|
82 |
+
match = re.search(regex_pattern, key2)
|
83 |
+
# ----
|
84 |
+
if match:
|
85 |
+
en_country = match.group(1)
|
86 |
+
ar_country = countries.get(en_country)
|
87 |
+
# ---
|
88 |
+
if ar_country and ar_country in value2:
|
89 |
+
key3 = re.sub(rf'\b{re.escape(en_country)}\b', 'country', key2)
|
90 |
+
value3 = re.sub(rf'\b{re.escape(ar_country)}\b', 'country', value2)
|
91 |
+
# ---
|
92 |
+
print(f"{key3} → {value3}")
|
93 |
+
# ---
|
94 |
+
data_lists["cats_2000_contry"][key3] = value3
|
95 |
+
# ----
|
96 |
+
"""
|
97 |
for en_c, ar_c in countries.items():
|
98 |
if en_c in key2 and ar_c in value2:
|
99 |
key3 = key2.replace(en_c, "country")
|
100 |
value3 = value2.replace(ar_c, "country")
|
101 |
# ---
|
102 |
data_lists["cats_2000_contry"][key3] = value3
|
103 |
+
"""
|
104 |
+
# ----
|
105 |
|
106 |
|
107 |
print(f"all data len: {len(data):,}.")
|
|
|
113 |
"cats_2000" : "categories_en2ar-cats_2000",
|
114 |
}
|
115 |
|
116 |
+
|
117 |
# for x, data_list in data_lists.items():
|
118 |
for x in to_work:
|
119 |
data_list = data_lists.get(x)
|
|
|
127 |
print("______________")
|
128 |
print(f"file: {x} uploaded successfully!")
|
129 |
print(f"len of {x} : {len(data_list)}.")
|
130 |
+
continue
|
131 |
upload_file(
|
132 |
path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
|
133 |
path_in_repo=f"{x}.json", # المسار داخل المستودع
|
|
|
144 |
dataset = Dataset.from_list(data_list)
|
145 |
|
146 |
# رفع Dataset إلى Hugging Face
|
147 |
+
dataset.push_to_hub(f"Ibrahemqasim/{set_name}")
|