Ibrahemqasim
/

enwiki_to_arwiki_categories

Arabic

Wikipedia

Wikipedia_Categories

Model card · Files and versions · Community

Ibrahemqasim committed on May 5

Commit

55da6d3

verified ·

1 Parent(s): c4328ec

Update fix langlinks.py

Browse files

Files changed (1) hide show

fix langlinks.py +36 -49

fix langlinks.py CHANGED Viewed

@@ -24,17 +24,15 @@ countries = response2.json()
 # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
 to_work = [
-    "langlinks",
-    "filtered_data",
-    "cats_2000_contry",
-    "cats_2000",
 ]
 data_lists = {
-    "langlinks" : {},
-    "filtered_data" : {},
-    "cats_2000_contry" : {},
-    "cats_2000" : {},
 }
@@ -63,12 +61,6 @@ for tab in tqdm.tqdm(data):
     if value.startswith(':"') and value.endswith('",'):
         value = value[2:-2]
     # ----
-    # data_lists["langlinks"].append({"en": key, "ar": value, "count": 0})
-    if key in data_lists["langlinks"]:
-        data_lists["langlinks"][key]["count"] += 1
-    else:
-        data_lists["langlinks"][key] = {"ar": value, "count": 0}
-    # ----
     # Add if key and value has 4 digits and they are the same
     reg_year = r"(\d+[–-]\d+|\d{4})"
     # ---
@@ -76,21 +68,21 @@ for tab in tqdm.tqdm(data):
     value_digits = re.search(reg_year, value)
     # ----
     if key_digits and value_digits and key_digits.group() == value_digits.group():
-        # data_lists["filtered_data"].append({"en": key, "ar": value, "count": 0})
-        if key in data_lists["filtered_data"]:
-            data_lists["filtered_data"][key]["count"] += 1
         else:
-            data_lists["filtered_data"][key] = {"ar": value, "count": 0}
         # ---
         key2 = key.replace(key_digits.group(), YEAR_PATTERN)
         value2 = value.replace(value_digits.group(), YEAR_PATTERN)
         # ---
-        # data_lists["cats_2000"].append({"en": key2, "ar": value2, "count": 0})
         # ---
-        if key2 in data_lists["cats_2000"]:
-            data_lists["cats_2000"][key2]["count"] += 1
         else:
-            data_lists["cats_2000"][key2] = {"ar": value2, "count": 0}
         # ----
         # البحث عن اسم الدولة في key2
         match = re.search(regex_pattern, key2)
@@ -105,54 +97,49 @@ for tab in tqdm.tqdm(data):
                 # ---
                 if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
                     # ---
-                    if key3 in data_lists["cats_2000_contry"]:
-                        data_lists["cats_2000_contry"][key3]["count"] += 1
                     else:
-                        data_lists["cats_2000_contry"][key3] = {"ar": value3, "count": 0}
         # ----
 # ----
-print(f"{len(data_lists['cats_2000_contry'])=}")
-print(f"{len(data_lists['cats_2000'])=}")
 print(f"all data len: {len(data):,}.")
-datasets_list = {
-    "langlinks" : "categories_en2ar",
-    "filtered_data" : "categories_en2ar_with_years",
-    "cats_2000_contry" : "categories_en2ar-cats_2000_contry",
-    "cats_2000" : "categories_en2ar-cats_2000",
-}
 # for x, data_list in data_lists.items():
 for x in to_work:
     data_list = data_lists.get(x)
     # ---
     data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
-    # حفظ القاموس المصحح في ملف JSON
-    with open(f"{x}.json", "w", encoding="utf-8") as f:
-        json.dump(data_list, f, ensure_ascii=False, indent=4)
     print("______________")
-    print(f"file: {x} uploaded successfully!")
     print(f"len of {x} : {len(data_list)}.")
     # ---
     # continue
     # ---
     upload_file(
         path_or_fileobj=f"{x}.json",  # اسم الملف الذي تم حفظه
         path_in_repo=f"{x}.json",  # المسار داخل المستودع
         repo_id="Ibrahemqasim/enwiki_to_arwiki_categories",  # معرف المستودع
         # repo_type="dataset",  # نوع المستودع (نستخدم dataset للملفات)
     )
     print("____________________________")
-    set_name = datasets_list.get(x)
-    if set_name:
-        # إنشاء Dataset
-        dataset = Dataset.from_list(data_list)
-        # رفع Dataset إلى Hugging Face
-        dataset.push_to_hub(f"Ibrahemqasim/{set_name}")

 # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
 to_work = [
+    "categories_with_years",
+    "categories_with_YEAR_COUNTRY_pattern",
+    "categories_with_YEAR_pattern",
 ]
 data_lists = {
+    "categories_with_years" : {},
+    "categories_with_YEAR_COUNTRY_pattern" : {},
+    "categories_with_YEAR_pattern" : {},
 }
     if value.startswith(':"') and value.endswith('",'):
         value = value[2:-2]
     # ----
     # Add if key and value has 4 digits and they are the same
     reg_year = r"(\d+[–-]\d+|\d{4})"
     # ---
     value_digits = re.search(reg_year, value)
     # ----
     if key_digits and value_digits and key_digits.group() == value_digits.group():
+        # data_lists["categories_with_years"].append({"en": key, "ar": value, "count": 1})
+        if key in data_lists["categories_with_years"]:
+            data_lists["categories_with_years"][key]["count"] += 1
         else:
+            data_lists["categories_with_years"][key] = {"ar": value, "count": 1}
         # ---
         key2 = key.replace(key_digits.group(), YEAR_PATTERN)
         value2 = value.replace(value_digits.group(), YEAR_PATTERN)
         # ---
+        # data_lists["categories_with_YEAR_pattern"].append({"en": key2, "ar": value2, "count": 1})
         # ---
+        if key2 in data_lists["categories_with_YEAR_pattern"]:
+            data_lists["categories_with_YEAR_pattern"][key2]["count"] += 1
         else:
+            data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
         # ----
         # البحث عن اسم الدولة في key2
         match = re.search(regex_pattern, key2)
                 # ---
                 if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
                     # ---
+                    if key3 in data_lists["categories_with_YEAR_COUNTRY_pattern"]:
+                        data_lists["categories_with_YEAR_COUNTRY_pattern"][key3]["count"] += 1
                     else:
+                        data_lists["categories_with_YEAR_COUNTRY_pattern"][key3] = {"ar": value3, "count": 1}
         # ----
 # ----
+print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
+print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")
 print(f"all data len: {len(data):,}.")
 # for x, data_list in data_lists.items():
 for x in to_work:
     data_list = data_lists.get(x)
     # ---
     data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
+    # ---
     print("______________")
     print(f"len of {x} : {len(data_list)}.")
     # ---
     # continue
     # ---
+    '''
+    # حفظ القاموس المصحح في ملف JSON
+    with open(f"{x}.json", "w", encoding="utf-8") as f:
+        json.dump(data_list, f, ensure_ascii=False, indent=4)
+    # ---
+    print(f"file: {x} uploaded successfully!")
+    # ---
     upload_file(
         path_or_fileobj=f"{x}.json",  # اسم الملف الذي تم حفظه
         path_in_repo=f"{x}.json",  # المسار داخل المستودع
         repo_id="Ibrahemqasim/enwiki_to_arwiki_categories",  # معرف المستودع
         # repo_type="dataset",  # نوع المستودع (نستخدم dataset للملفات)
     )
+    '''
+    # ---
     print("____________________________")
+    # ---
+    # إنشاء Dataset
+    dataset = Dataset.from_list(data_list)
+    # رفع Dataset إلى Hugging Face
+    dataset.push_to_hub(f"Ibrahemqasim/{x}")
+    # ---
+    print(f"dataset: Ibrahemqasim/{x} push_to_hub successfully!")