File size: 5,730 Bytes

8dc5272
7006d9b
8dc5272
 
 
 
d6d2307
8dc5272
675327e
 
8dc5272
6e8a74c
8dc5272
 
 
 
 
 
f012e57
 
 
 
 
8dc5272
7006d9b
4cf98f3
a3e5844
55da6d3
 
 
4cf98f3
 
7006d9b
a3e5844
55da6d3
 
 
7006d9b
 
675327e
 
 
a3e5844
675327e
 
 
b5fe413
 
675327e
f012e57
 
 
 
 
8dc5272
 
 
 
 
 
 
 
 
 
 
7006d9b
b5fe413
 
 
 
7006d9b
 
55da6d3
 
 
b5fe413
55da6d3
b5fe413
 
 
7006d9b
55da6d3
7006d9b
55da6d3
 
b5fe413
55da6d3
f012e57
675327e
 
 
 
 
 
 
 
b5fe413
 
675327e
b5fe413
46e3596
55da6d3
 
b5fe413
55da6d3
675327e
b5fe413
55da6d3
 
7006d9b
a814d50
 
4cf98f3
 
 
 
a3e5844
 
 
 
dadb4f6
 
55da6d3
a814d50
 
b5fe413
 
 
55da6d3
 
 
 
 
 
 
7006d9b
 
 
 
 
 
55da6d3
 
4cf98f3
55da6d3
 
 
d6d2307
55da6d3

import tqdm
import re
import json
import requests
from huggingface_hub import login
from huggingface_hub import upload_file
from datasets import Dataset

from google.colab import userdata

# Authenticate with Hugging Face using the HF_TOKEN secret stored in Colab.
login(userdata.get('HF_TOKEN'))

# Seconds to wait on the server before giving up; without a timeout
# `requests` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 60

# Download the English -> Arabic category links straight from the Hub.
json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
response = requests.get(json_url, timeout=REQUEST_TIMEOUT)
# Stop here on an HTTP error instead of failing later while decoding an
# error page as JSON.
response.raise_for_status()
data = response.json()

# Download the English -> Arabic country-name mapping from the same repo.
json_url2 = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/countries.json"
response2 = requests.get(json_url2, timeout=REQUEST_TIMEOUT)
response2.raise_for_status()
countries = response2.json()

# Names of the tables to publish, in upload order.
to_work = [
    "countries",
    "categories_with_years",
    "categories_with_YEAR_COUNTRY_pattern",
    "categories_with_YEAR_pattern",
]

# One accumulator per table name. The category tables are filled while the
# rows are scanned; "countries" is replaced just below.
data_lists = {table_name: {} for table_name in to_work}

# Country mapping re-ordered so that entries whose Arabic name contains the
# most spaces come first.
# NOTE(review): this orders by the Arabic value while `sorted_keys` orders by
# the English key — confirm that the difference is intended.
data_lists["countries"] = dict(
    sorted(countries.items(), key=lambda item: item[1].count(' '), reverse=True)
)

# English country names in the order they are tried by the regex: most spaces
# first, then longest first. `re` tries alternatives left to right and accepts
# the first one that matches, so a name that is the beginning of a longer name
# (for example "Guinea" and "Guinea-Bissau", if both are present) has to come
# after the longer one or it would always win.
sorted_keys = sorted(
    countries,
    key=lambda name: (name.count(' '), len(name)),
    reverse=True,
)
# Single alternation of all escaped country names, bounded by word boundaries.
regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'

# Placeholders substituted for the year and the country in category titles.
YEAR_PATTERN = "{YEAR}"
COUNTRY_PATTERN = "{COUNTRY}"

def _tally(bucket, en_text, ar_text):
    """Record one occurrence of *en_text* in *bucket*.

    The first occurrence stores ``{"ar": ar_text, "count": 1}``; later
    occurrences only increment ``count`` and keep the Arabic text that was
    stored first.
    """
    entry = bucket.get(en_text)
    if entry is None:
        bucket[en_text] = {"ar": ar_text, "count": 1}
    else:
        entry["count"] += 1


# Both patterns are identical for every row, so compile them once.
# A year token is either a digit range joined by "-" or "–", or four digits.
_YEAR_RE = re.compile(r"(\d+[–-]\d+|\d{4})")
_COUNTRY_RE = re.compile(regex_pattern)

for tab in tqdm.tqdm(data):
    key = tab["en"]
    value = tab["ar"]

    # Some English titles are wrapped in double quotes, e.g.
    # "Category:1. FC Köln non-playing staff" — drop the wrapping quotes.
    if key.startswith('"') and key.endswith('"'):
        key = key[1:-1]
    # Some Arabic titles look like :"cc", — drop the leading :" and the
    # trailing ",
    if value.startswith(':"') and value.endswith('",'):
        value = value[2:-2]

    key_digits = _YEAR_RE.search(key)
    value_digits = _YEAR_RE.search(value)
    # Keep only rows whose first year token is identical in both languages.
    if not (key_digits and value_digits and key_digits.group() == value_digits.group()):
        continue
    year = key_digits.group()

    _tally(data_lists["categories_with_years"], key, value)

    # Replace every occurrence of the year with the placeholder.
    key2 = key.replace(year, YEAR_PATTERN)
    value2 = value.replace(year, YEAR_PATTERN)
    _tally(data_lists["categories_with_YEAR_pattern"], key2, value2)

    # Look for a known country name in the English template.
    match = _COUNTRY_RE.search(key2)
    if not match:
        continue
    en_country = match.group(1)
    ar_country = countries.get(en_country)
    # The Arabic template must mention the matching Arabic country name.
    if not ar_country or ar_country not in value2:
        continue

    key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
    value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)
    # The word-boundary substitution can leave a side unchanged even though
    # the plain substring test passed; record only when both sides changed.
    if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
        _tally(data_lists["categories_with_YEAR_COUNTRY_pattern"], key3, value3)
# ----
# Debug output: the "=" f-string specifier prints the expression text followed
# by its value — here the number of distinct templates collected per table.
print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")

# Number of entries in the input `data`, with thousands separators.
print(f"all data len: {len(data):,}.")

# Publish every table named in `to_work` as its own Hugging Face dataset.
for list_name in to_work:
    table = data_lists.get(list_name)

    if list_name == "countries":
        # Plain en/ar pairs, in the order already set on the mapping.
        data_list = [{"en": en, "ar": ar} for en, ar in table.items()]
    else:
        data_list = [
            {"en": en, "ar": info["ar"], "count": info["count"]}
            for en, info in table.items()
        ]
        # Most frequent first; the sort is stable, so ties keep their
        # insertion order.
        data_list.sort(key=lambda row: row["count"], reverse=True)

    print("______________")
    print(f"len of {list_name} : {len(data_list)}.")

    # Disabled alternative, kept for reference: write the table to a local
    # JSON file and upload that file to the model repository.
    #
    #     with open(f"{list_name}.json", "w", encoding="utf-8") as f:
    #         json.dump(data_list, f, ensure_ascii=False, indent=4)
    #     print(f"file: {list_name} uploaded successfully!")
    #     upload_file(
    #         path_or_fileobj=f"{list_name}.json",  # file saved above
    #         path_in_repo=f"{list_name}.json",  # path inside the repo
    #         repo_id="Ibrahemqasim/enwiki_to_arwiki_categories",
    #         # repo_type="dataset",
    #     )

    print("____________________________")

    # Build the Dataset from the list of row dicts and push it to the Hub.
    dataset = Dataset.from_list(data_list)
    dataset.push_to_hub(f"Ibrahemqasim/{list_name}")

    print(f"dataset: Ibrahemqasim/{list_name} push_to_hub successfully!")