Ibrahemqasim
/

enwiki_to_arwiki_categories

Arabic

Wikipedia

Wikipedia_Categories

Model card · Files and versions · Community

Ibrahemqasim committed on May 5

Commit

b5fe413

verified ·

1 Parent(s): 46e3596

Update fix langlinks.py

Browse files

Files changed (1) hide show

fix langlinks.py +41 -30

fix langlinks.py CHANGED Viewed

@@ -24,10 +24,10 @@ countries = response2.json()
 # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
 to_work = [
-    # "langlinks",
-    # "filtered_data",
     "cats_2000_contry",
-    # "cats_2000",
 ]
 data_lists = {
@@ -44,6 +44,8 @@ sorted_keys = sorted(countries.keys(), key=lambda x: -x.count(' '))
 # نبني تعبير regex
 regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
 for tab in tqdm.tqdm(data):
     # ---
@@ -61,22 +63,34 @@ for tab in tqdm.tqdm(data):
     if value.startswith(':"') and value.endswith('",'):
         value = value[2:-2]
     # ----
-    # data_lists["langlinks"].append({"en": key, "ar": value})
-    data_lists["langlinks"][key] = value
     # ----
     # Add if key and value both have 4 digits and they are the same
-    key_digits = re.search(r"\d{4}", key)
-    value_digits = re.search(r"\d{4}", value)
     # ----
     if key_digits and value_digits and key_digits.group() == value_digits.group():
-        # data_lists["filtered_data"].append({"en": key, "ar": value})
-        data_lists["filtered_data"][key] = value
         # ---
-        key2 = key.replace(key_digits.group(), "2000")
-        value2 = value.replace(value_digits.group(), "2000")
         # ---
-        # data_lists["cats_2000"].append({"en": key2, "ar": value2})
-        data_lists["cats_2000"][key2] = value2
         # ----
         # البحث عن اسم الدولة في key2
         match = re.search(regex_pattern, key2)
@@ -86,24 +100,19 @@ for tab in tqdm.tqdm(data):
             ar_country = countries.get(en_country)
             # ---
             if ar_country and ar_country in value2:
-                key3 = re.sub(rf'\b{re.escape(en_country)}\b', 'country', key2)
-                value3 = re.sub(rf'\b{re.escape(ar_country)}\b', 'country', value2)
                 # ---
-                if key3 not in data_lists["cats_2000_contry"]:
-                    print(f"{key3} → {value3}")
                     # ---
-                    data_lists["cats_2000_contry"][key3] = value3
-        # ----
-        """
-        for en_c, ar_c in countries.items():
-            if en_c in key2 and ar_c in value2:
-                key3 = key2.replace(en_c, "country")
-                value3 = value2.replace(ar_c, "country")
-                # ---
-                data_lists["cats_2000_contry"][key3] = value3
-        """
         # ----
 print(f"all data len: {len(data):,}.")
@@ -118,7 +127,7 @@ datasets_list = {
 for x in to_work:
     data_list = data_lists.get(x)
     # ---
-    data_list = [{"en": key, "ar": value} for key, value in data_list.items()]
     # حفظ القاموس المصحح في ملف JSON
     with open(f"{x}.json", "w", encoding="utf-8") as f:
@@ -127,7 +136,9 @@ for x in to_work:
     print("______________")
     print(f"file: {x} uploaded successfully!")
     print(f"len of {x} : {len(data_list)}.")
-    #continue
     upload_file(
         path_or_fileobj=f"{x}.json",  # اسم الملف الذي تم حفظه
         path_in_repo=f"{x}.json",  # المسار داخل المستودع

 # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
 to_work = [
+    "langlinks",
+    "filtered_data",
     "cats_2000_contry",
+    "cats_2000",
 ]
 data_lists = {
 # نبني تعبير regex
 regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
+YEAR_PATTERN = "{YEAR}"
+COUNTRY_PATTERN = "{COUNTRY}"
 for tab in tqdm.tqdm(data):
     # ---
     if value.startswith(':"') and value.endswith('",'):
         value = value[2:-2]
     # ----
+    # data_lists["langlinks"].append({"en": key, "ar": value, "count": 0})
+    if key in data_lists["langlinks"]:
+        data_lists["langlinks"][key]["count"] += 1
+    else:
+        data_lists["langlinks"][key] = {"ar": value, "count": 0}
     # ----
     # Add if key and value both contain a year (4 digits) or a digit range, and the two matches are the same
+    reg_year = r"(\d+[–-]\d+|\d{4})"
+    # ---
+    key_digits = re.search(reg_year, key)
+    value_digits = re.search(reg_year, value)
     # ----
     if key_digits and value_digits and key_digits.group() == value_digits.group():
+        # data_lists["filtered_data"].append({"en": key, "ar": value, "count": 0})
+        if key in data_lists["filtered_data"]:
+            data_lists["filtered_data"][key]["count"] += 1
+        else:
+            data_lists["filtered_data"][key] = {"ar": value, "count": 0}
+        # ---
+        key2 = key.replace(key_digits.group(), YEAR_PATTERN)
+        value2 = value.replace(value_digits.group(), YEAR_PATTERN)
         # ---
+        # data_lists["cats_2000"].append({"en": key2, "ar": value2, "count": 0})
         # ---
+        if key2 in data_lists["cats_2000"]:
+            data_lists["cats_2000"][key2]["count"] += 1
+        else:
+            data_lists["cats_2000"][key2] = {"ar": value2, "count": 0}
         # ----
         # البحث عن اسم الدولة في key2
         match = re.search(regex_pattern, key2)
             ar_country = countries.get(en_country)
             # ---
             if ar_country and ar_country in value2:
+                key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
+                value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)
                 # ---
+                if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
                     # ---
+                    if key3 in data_lists["cats_2000_contry"]:
+                        data_lists["cats_2000_contry"][key3]["count"] += 1
+                    else:
+                        data_lists["cats_2000_contry"][key3] = {"ar": value3, "count": 0}
         # ----
+# ----
+print(f"{len(data_lists['cats_2000_contry'])=}")
+print(f"{len(data_lists['cats_2000'])=}")
 print(f"all data len: {len(data):,}.")
 for x in to_work:
     data_list = data_lists.get(x)
     # ---
+    data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
     # حفظ القاموس المصحح في ملف JSON
     with open(f"{x}.json", "w", encoding="utf-8") as f:
     print("______________")
     print(f"file: {x} uploaded successfully!")
     print(f"len of {x} : {len(data_list)}.")
+    # ---
+    # continue
+    # ---
     upload_file(
         path_or_fileobj=f"{x}.json",  # اسم الملف الذي تم حفظه
         path_in_repo=f"{x}.json",  # المسار داخل المستودع