Ibrahemqasim
/

enwiki_to_arwiki_categories

Arabic

Wikipedia

Wikipedia_Categories

Model card · Files and versions

xet

Community

Ibrahemqasim committed on May 5

Commit

675327e

verified ·

1 Parent(s): 440236c

Update fix langlinks.py

Browse files

Files changed (1) hide show

fix langlinks.py +34 -5

fix langlinks.py CHANGED Viewed

@@ -6,8 +6,10 @@ from huggingface_hub import login
 from huggingface_hub import upload_file
 from datasets import Dataset
 # تسجيل الدخول إلى Hugging Face (استبدل "YOUR_ACCESS_TOKEN" بالتوكن الخاص بك)
-login("YOUR_ACCESS_TOKEN")
 # تحميل الملف JSON من الرابط مباشرة
 json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
@@ -24,8 +26,8 @@ countries = response2.json()
 to_work = [
     # "langlinks",
     # "filtered_data",
-    # "cats_2000_contry",
-    "cats_2000",
 ]
 data_lists = {
@@ -35,6 +37,14 @@ data_lists = {
     "cats_2000" : {},
 }
 for tab in tqdm.tqdm(data):
     # ---
     key = tab["en"]
@@ -68,12 +78,30 @@ for tab in tqdm.tqdm(data):
         # data_lists["cats_2000"].append({"en": key2, "ar": value2})
         data_lists["cats_2000"][key2] = value2
         # ----
         for en_c, ar_c in countries.items():
             if en_c in key2 and ar_c in value2:
                 key3 = key2.replace(en_c, "country")
                 value3 = value2.replace(ar_c, "country")
                 # ---
                 data_lists["cats_2000_contry"][key3] = value3
 print(f"all data len: {len(data):,}.")
@@ -85,6 +113,7 @@ datasets_list = {
     "cats_2000" : "categories_en2ar-cats_2000",
 }
 # for x, data_list in data_lists.items():
 for x in to_work:
     data_list = data_lists.get(x)
@@ -98,7 +127,7 @@ for x in to_work:
     print("______________")
     print(f"file: {x} uploaded successfully!")
     print(f"len of {x} : {len(data_list)}.")
     upload_file(
         path_or_fileobj=f"{x}.json",  # اسم الملف الذي تم حفظه
         path_in_repo=f"{x}.json",  # المسار داخل المستودع
@@ -115,4 +144,4 @@ for x in to_work:
         dataset = Dataset.from_list(data_list)
         # رفع Dataset إلى Hugging Face
-        dataset.push_to_hub(f"Ibrahemqasim/{set_name}")

 from huggingface_hub import upload_file
 from datasets import Dataset
+from google.colab import userdata
 # تسجيل الدخول إلى Hugging Face (استبدل "YOUR_ACCESS_TOKEN" بالتوكن الخاص بك)
+login(userdata.get('HF_API'))
 # تحميل الملف JSON من الرابط مباشرة
 json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
 to_work = [
     # "langlinks",
     # "filtered_data",
+    "cats_2000_contry",
+    # "cats_2000",
 ]
 data_lists = {
     "cats_2000" : {},
 }
+# نرتب الدول حسب عدد الفراغات (تنازليًا)
+sorted_keys = sorted(countries.keys(), key=lambda x: -x.count(' '))
+# نبني تعبير regex
+regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
 for tab in tqdm.tqdm(data):
     # ---
     key = tab["en"]
         # data_lists["cats_2000"].append({"en": key2, "ar": value2})
         data_lists["cats_2000"][key2] = value2
         # ----
+        # البحث عن اسم الدولة في key2
+        match = re.search(regex_pattern, key2)
+        # ----
+        if match:
+            en_country = match.group(1)
+            ar_country = countries.get(en_country)
+            # ---
+            if ar_country and ar_country in value2:
+                key3 = re.sub(rf'\b{re.escape(en_country)}\b', 'country', key2)
+                value3 = re.sub(rf'\b{re.escape(ar_country)}\b', 'country', value2)
+                # ---
+                print(f"{key3} → {value3}")
+                # ---
+                data_lists["cats_2000_contry"][key3] = value3
+        # ----
+        """
         for en_c, ar_c in countries.items():
             if en_c in key2 and ar_c in value2:
                 key3 = key2.replace(en_c, "country")
                 value3 = value2.replace(ar_c, "country")
                 # ---
                 data_lists["cats_2000_contry"][key3] = value3
+        """
+        # ----
 print(f"all data len: {len(data):,}.")
     "cats_2000" : "categories_en2ar-cats_2000",
 }
 # for x, data_list in data_lists.items():
 for x in to_work:
     data_list = data_lists.get(x)
     print("______________")
     print(f"file: {x} uploaded successfully!")
     print(f"len of {x} : {len(data_list)}.")
+    continue
     upload_file(
         path_or_fileobj=f"{x}.json",  # اسم الملف الذي تم حفظه
         path_in_repo=f"{x}.json",  # المسار داخل المستودع
         dataset = Dataset.from_list(data_list)
         # رفع Dataset إلى Hugging Face
+        dataset.push_to_hub(f"Ibrahemqasim/{set_name}")