Update fix langlinks.py
Browse files - fix langlinks.py +41 -30
fix langlinks.py
CHANGED
@@ -24,10 +24,10 @@ countries = response2.json()
|
|
24 |
# تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
|
25 |
|
26 |
to_work = [
|
27 |
-
|
28 |
-
|
29 |
"cats_2000_contry",
|
30 |
-
|
31 |
]
|
32 |
|
33 |
data_lists = {
|
@@ -44,6 +44,8 @@ sorted_keys = sorted(countries.keys(), key=lambda x: -x.count(' '))
|
|
44 |
# نبني تعبير regex
|
45 |
regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
|
46 |
|
|
|
|
|
47 |
|
48 |
for tab in tqdm.tqdm(data):
|
49 |
# ---
|
@@ -61,22 +63,34 @@ for tab in tqdm.tqdm(data):
|
|
61 |
if value.startswith(':"') and value.endswith('",'):
|
62 |
value = value[2:-2]
|
63 |
# ----
|
64 |
-
# data_lists["langlinks"].append({"en": key, "ar": value})
|
65 |
-
data_lists["langlinks"]
|
|
|
|
|
|
|
66 |
# ----
|
67 |
# Add if key and value both have 4 digits and those digits are the same
|
68 |
-
|
69 |
-
|
|
|
|
|
70 |
# ----
|
71 |
if key_digits and value_digits and key_digits.group() == value_digits.group():
|
72 |
-
# data_lists["filtered_data"].append({"en": key, "ar": value})
|
73 |
-
data_lists["filtered_data"]
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
# ---
|
75 |
-
|
76 |
-
value2 = value.replace(value_digits.group(), "2000")
|
77 |
# ---
|
78 |
-
|
79 |
-
|
|
|
|
|
80 |
# ----
|
81 |
# البحث عن اسم الدولة في key2
|
82 |
match = re.search(regex_pattern, key2)
|
@@ -86,24 +100,19 @@ for tab in tqdm.tqdm(data):
|
|
86 |
ar_country = countries.get(en_country)
|
87 |
# ---
|
88 |
if ar_country and ar_country in value2:
|
89 |
-
key3 = re.sub(rf'\b{re.escape(en_country)}\b',
|
90 |
-
value3 = re.sub(rf'\b{re.escape(ar_country)}\b',
|
91 |
# ---
|
92 |
-
if key3
|
93 |
-
print(f"{key3} → {value3}")
|
94 |
# ---
|
95 |
-
data_lists["cats_2000_contry"]
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
if en_c in key2 and ar_c in value2:
|
100 |
-
key3 = key2.replace(en_c, "country")
|
101 |
-
value3 = value2.replace(ar_c, "country")
|
102 |
-
# ---
|
103 |
-
data_lists["cats_2000_contry"][key3] = value3
|
104 |
-
"""
|
105 |
# ----
|
106 |
-
|
|
|
|
|
107 |
|
108 |
print(f"all data len: {len(data):,}.")
|
109 |
|
@@ -118,7 +127,7 @@ datasets_list = {
|
|
118 |
for x in to_work:
|
119 |
data_list = data_lists.get(x)
|
120 |
# ---
|
121 |
-
data_list = [{"en": key, "ar": value} for key, value in data_list.items()]
|
122 |
|
123 |
# حفظ القاموس المصحح في ملف JSON
|
124 |
with open(f"{x}.json", "w", encoding="utf-8") as f:
|
@@ -127,7 +136,9 @@ for x in to_work:
|
|
127 |
print("______________")
|
128 |
print(f"file: {x} uploaded successfully!")
|
129 |
print(f"len of {x} : {len(data_list)}.")
|
130 |
-
#
|
|
|
|
|
131 |
upload_file(
|
132 |
path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
|
133 |
path_in_repo=f"{x}.json", # المسار داخل المستودع
|
|
|
24 |
# تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
|
25 |
|
26 |
to_work = [
|
27 |
+
"langlinks",
|
28 |
+
"filtered_data",
|
29 |
"cats_2000_contry",
|
30 |
+
"cats_2000",
|
31 |
]
|
32 |
|
33 |
data_lists = {
|
|
|
44 |
# نبني تعبير regex
|
45 |
regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
|
46 |
|
47 |
+
YEAR_PATTERN = "{YEAR}"
|
48 |
+
COUNTRY_PATTERN = "{COUNTRY}"
|
49 |
|
50 |
for tab in tqdm.tqdm(data):
|
51 |
# ---
|
|
|
63 |
if value.startswith(':"') and value.endswith('",'):
|
64 |
value = value[2:-2]
|
65 |
# ----
|
66 |
+
# data_lists["langlinks"].append({"en": key, "ar": value, "count": 0})
|
67 |
+
if key in data_lists["langlinks"]:
|
68 |
+
data_lists["langlinks"][key]["count"] += 1
|
69 |
+
else:
|
70 |
+
data_lists["langlinks"][key] = {"ar": value, "count": 0}
|
71 |
# ----
|
72 |
# Add if key and value contain the same year token (a digit range such as 1990–91, or 4 digits)
|
73 |
+
reg_year = r"(\d+[–-]\d+|\d{4})"
|
74 |
+
# ---
|
75 |
+
key_digits = re.search(reg_year, key)
|
76 |
+
value_digits = re.search(reg_year, value)
|
77 |
# ----
|
78 |
if key_digits and value_digits and key_digits.group() == value_digits.group():
|
79 |
+
# data_lists["filtered_data"].append({"en": key, "ar": value, "count": 0})
|
80 |
+
if key in data_lists["filtered_data"]:
|
81 |
+
data_lists["filtered_data"][key]["count"] += 1
|
82 |
+
else:
|
83 |
+
data_lists["filtered_data"][key] = {"ar": value, "count": 0}
|
84 |
+
# ---
|
85 |
+
key2 = key.replace(key_digits.group(), YEAR_PATTERN)
|
86 |
+
value2 = value.replace(value_digits.group(), YEAR_PATTERN)
|
87 |
# ---
|
88 |
+
# data_lists["cats_2000"].append({"en": key2, "ar": value2, "count": 0})
|
|
|
89 |
# ---
|
90 |
+
if key2 in data_lists["cats_2000"]:
|
91 |
+
data_lists["cats_2000"][key2]["count"] += 1
|
92 |
+
else:
|
93 |
+
data_lists["cats_2000"][key2] = {"ar": value2, "count": 0}
|
94 |
# ----
|
95 |
# البحث عن اسم الدولة في key2
|
96 |
match = re.search(regex_pattern, key2)
|
|
|
100 |
ar_country = countries.get(en_country)
|
101 |
# ---
|
102 |
if ar_country and ar_country in value2:
|
103 |
+
key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
|
104 |
+
value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)
|
105 |
# ---
|
106 |
+
if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
|
|
|
107 |
# ---
|
108 |
+
if key3 in data_lists["cats_2000_contry"]:
|
109 |
+
data_lists["cats_2000_contry"][key3]["count"] += 1
|
110 |
+
else:
|
111 |
+
data_lists["cats_2000_contry"][key3] = {"ar": value3, "count": 0}
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
# ----
|
113 |
+
# ----
|
114 |
+
print(f"{len(data_lists['cats_2000_contry'])=}")
|
115 |
+
print(f"{len(data_lists['cats_2000'])=}")
|
116 |
|
117 |
print(f"all data len: {len(data):,}.")
|
118 |
|
|
|
127 |
for x in to_work:
|
128 |
data_list = data_lists.get(x)
|
129 |
# ---
|
130 |
+
data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
|
131 |
|
132 |
# حفظ القاموس المصحح في ملف JSON
|
133 |
with open(f"{x}.json", "w", encoding="utf-8") as f:
|
|
|
136 |
print("______________")
|
137 |
print(f"file: {x} uploaded successfully!")
|
138 |
print(f"len of {x} : {len(data_list)}.")
|
139 |
+
# ---
|
140 |
+
# continue
|
141 |
+
# ---
|
142 |
upload_file(
|
143 |
path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
|
144 |
path_in_repo=f"{x}.json", # المسار داخل المستودع
|