Update fix langlinks.py
Browse files- fix langlinks.py +36 -49
fix langlinks.py
CHANGED
@@ -24,17 +24,15 @@ countries = response2.json()
|
|
24 |
# تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
|
25 |
|
26 |
to_work = [
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"cats_2000",
|
31 |
]
|
32 |
|
33 |
data_lists = {
|
34 |
-
"
|
35 |
-
"
|
36 |
-
"
|
37 |
-
"cats_2000" : {},
|
38 |
}
|
39 |
|
40 |
|
@@ -63,12 +61,6 @@ for tab in tqdm.tqdm(data):
|
|
63 |
if value.startswith(':"') and value.endswith('",'):
|
64 |
value = value[2:-2]
|
65 |
# ----
|
66 |
-
# data_lists["langlinks"].append({"en": key, "ar": value, "count": 0})
|
67 |
-
if key in data_lists["langlinks"]:
|
68 |
-
data_lists["langlinks"][key]["count"] += 1
|
69 |
-
else:
|
70 |
-
data_lists["langlinks"][key] = {"ar": value, "count": 0}
|
71 |
-
# ----
|
72 |
# Add if key and value both contain the same 4-digit year or year range
|
73 |
reg_year = r"(\d+[–-]\d+|\d{4})"
|
74 |
# ---
|
@@ -76,21 +68,21 @@ for tab in tqdm.tqdm(data):
|
|
76 |
value_digits = re.search(reg_year, value)
|
77 |
# ----
|
78 |
if key_digits and value_digits and key_digits.group() == value_digits.group():
|
79 |
-
# data_lists["
|
80 |
-
if key in data_lists["
|
81 |
-
data_lists["
|
82 |
else:
|
83 |
-
data_lists["
|
84 |
# ---
|
85 |
key2 = key.replace(key_digits.group(), YEAR_PATTERN)
|
86 |
value2 = value.replace(value_digits.group(), YEAR_PATTERN)
|
87 |
# ---
|
88 |
-
# data_lists["
|
89 |
# ---
|
90 |
-
if key2 in data_lists["
|
91 |
-
data_lists["
|
92 |
else:
|
93 |
-
data_lists["
|
94 |
# ----
|
95 |
# البحث عن اسم الدولة في key2
|
96 |
match = re.search(regex_pattern, key2)
|
@@ -105,54 +97,49 @@ for tab in tqdm.tqdm(data):
|
|
105 |
# ---
|
106 |
if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
|
107 |
# ---
|
108 |
-
if key3 in data_lists["
|
109 |
-
data_lists["
|
110 |
else:
|
111 |
-
data_lists["
|
112 |
# ----
|
113 |
# ----
|
114 |
-
print(f"{len(data_lists['
|
115 |
-
print(f"{len(data_lists['
|
116 |
|
117 |
print(f"all data len: {len(data):,}.")
|
118 |
|
119 |
-
datasets_list = {
|
120 |
-
"langlinks" : "categories_en2ar",
|
121 |
-
"filtered_data" : "categories_en2ar_with_years",
|
122 |
-
"cats_2000_contry" : "categories_en2ar-cats_2000_contry",
|
123 |
-
"cats_2000" : "categories_en2ar-cats_2000",
|
124 |
-
}
|
125 |
-
|
126 |
# for x, data_list in data_lists.items():
|
127 |
for x in to_work:
|
128 |
data_list = data_lists.get(x)
|
129 |
# ---
|
130 |
data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
|
131 |
-
|
132 |
-
# حفظ القاموس المصحح في ملف JSON
|
133 |
-
with open(f"{x}.json", "w", encoding="utf-8") as f:
|
134 |
-
json.dump(data_list, f, ensure_ascii=False, indent=4)
|
135 |
-
|
136 |
print("______________")
|
137 |
-
print(f"file: {x} uploaded successfully!")
|
138 |
print(f"len of {x} : {len(data_list)}.")
|
139 |
# ---
|
140 |
# continue
|
141 |
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
upload_file(
|
143 |
path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
|
144 |
path_in_repo=f"{x}.json", # المسار داخل المستودع
|
145 |
repo_id="Ibrahemqasim/enwiki_to_arwiki_categories", # معرف المستودع
|
146 |
# repo_type="dataset", # نوع المستودع (نستخدم dataset للملفات)
|
147 |
)
|
148 |
-
|
|
|
149 |
print("____________________________")
|
|
|
|
|
|
|
150 |
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
dataset = Dataset.from_list(data_list)
|
156 |
-
|
157 |
-
# رفع Dataset إلى Hugging Face
|
158 |
-
dataset.push_to_hub(f"Ibrahemqasim/{set_name}")
|
|
|
24 |
# تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
|
25 |
|
26 |
to_work = [
|
27 |
+
"categories_with_years",
|
28 |
+
"categories_with_YEAR_COUNTRY_pattern",
|
29 |
+
"categories_with_YEAR_pattern",
|
|
|
30 |
]
|
31 |
|
32 |
data_lists = {
|
33 |
+
"categories_with_years" : {},
|
34 |
+
"categories_with_YEAR_COUNTRY_pattern" : {},
|
35 |
+
"categories_with_YEAR_pattern" : {},
|
|
|
36 |
}
|
37 |
|
38 |
|
|
|
61 |
if value.startswith(':"') and value.endswith('",'):
|
62 |
value = value[2:-2]
|
63 |
# ----
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
# Add if key and value both contain the same 4-digit year or year range
|
65 |
reg_year = r"(\d+[–-]\d+|\d{4})"
|
66 |
# ---
|
|
|
68 |
value_digits = re.search(reg_year, value)
|
69 |
# ----
|
70 |
if key_digits and value_digits and key_digits.group() == value_digits.group():
|
71 |
+
# data_lists["categories_with_years"].append({"en": key, "ar": value, "count": 1})
|
72 |
+
if key in data_lists["categories_with_years"]:
|
73 |
+
data_lists["categories_with_years"][key]["count"] += 1
|
74 |
else:
|
75 |
+
data_lists["categories_with_years"][key] = {"ar": value, "count": 1}
|
76 |
# ---
|
77 |
key2 = key.replace(key_digits.group(), YEAR_PATTERN)
|
78 |
value2 = value.replace(value_digits.group(), YEAR_PATTERN)
|
79 |
# ---
|
80 |
+
# data_lists["categories_with_YEAR_pattern"].append({"en": key2, "ar": value2, "count": 1})
|
81 |
# ---
|
82 |
+
if key2 in data_lists["categories_with_YEAR_pattern"]:
|
83 |
+
data_lists["categories_with_YEAR_pattern"][key2]["count"] += 1
|
84 |
else:
|
85 |
+
data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
|
86 |
# ----
|
87 |
# البحث عن اسم الدولة في key2
|
88 |
match = re.search(regex_pattern, key2)
|
|
|
97 |
# ---
|
98 |
if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
|
99 |
# ---
|
100 |
+
if key3 in data_lists["categories_with_YEAR_COUNTRY_pattern"]:
|
101 |
+
data_lists["categories_with_YEAR_COUNTRY_pattern"][key3]["count"] += 1
|
102 |
else:
|
103 |
+
data_lists["categories_with_YEAR_COUNTRY_pattern"][key3] = {"ar": value3, "count": 1}
|
104 |
# ----
|
105 |
# ----
|
106 |
+
print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
|
107 |
+
print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")
|
108 |
|
109 |
print(f"all data len: {len(data):,}.")
|
110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
# for x, data_list in data_lists.items():
|
112 |
for x in to_work:
|
113 |
data_list = data_lists.get(x)
|
114 |
# ---
|
115 |
data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
|
116 |
+
# ---
|
|
|
|
|
|
|
|
|
117 |
print("______________")
|
|
|
118 |
print(f"len of {x} : {len(data_list)}.")
|
119 |
# ---
|
120 |
# continue
|
121 |
# ---
|
122 |
+
'''
|
123 |
+
# حفظ القاموس المصحح في ملف JSON
|
124 |
+
with open(f"{x}.json", "w", encoding="utf-8") as f:
|
125 |
+
json.dump(data_list, f, ensure_ascii=False, indent=4)
|
126 |
+
# ---
|
127 |
+
print(f"file: {x} uploaded successfully!")
|
128 |
+
# ---
|
129 |
upload_file(
|
130 |
path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
|
131 |
path_in_repo=f"{x}.json", # المسار داخل المستودع
|
132 |
repo_id="Ibrahemqasim/enwiki_to_arwiki_categories", # معرف المستودع
|
133 |
# repo_type="dataset", # نوع المستودع (نستخدم dataset للملفات)
|
134 |
)
|
135 |
+
'''
|
136 |
+
# ---
|
137 |
print("____________________________")
|
138 |
+
# ---
|
139 |
+
# إنشاء Dataset
|
140 |
+
dataset = Dataset.from_list(data_list)
|
141 |
|
142 |
+
# رفع Dataset إلى Hugging Face
|
143 |
+
dataset.push_to_hub(f"Ibrahemqasim/{x}")
|
144 |
+
# ---
|
145 |
+
print(f"dataset: Ibrahemqasim/{x} push_to_hub successfully!")
|
|
|
|
|
|
|
|