File size: 5,337 Bytes
805087b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import tqdm
import re
import requests
from huggingface_hub import login
from datasets import Dataset
from datasets import load_dataset
from google.colab import userdata

# Authenticate against the Hugging Face Hub with the token kept in Colab's user data.
login(userdata.get('HF_TOKEN'))

# Source corpus of category titles; rows are read through their "en" and "ar" columns.
data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")

# Columns of the nationalities dataset:
# "nat_en", "man", "men", "women", "womens", "country_en", "country_ar"
nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
nationalities_dict = {row["nat_en"]: row for row in nationalities}
# Names containing more spaces are placed first in the alternation, so a
# multi-word name is tried before the shorter names listed after it.
_nat_names = [
    row["nat_en"]
    for row in sorted(nationalities, key=lambda row: -row["nat_en"].count(' '))
]
_nat_alternation = '|'.join(re.escape(name) for name in _nat_names)
nationalities_pattern = r'\b(' + _nat_alternation + r')\b'

countries = load_dataset("Ibrahemqasim/countries", split="train")
countries_dict = {row["en"]: row for row in countries}
# Same ordering rule as above, applied to the English country names.
_country_names = [
    row["en"]
    for row in sorted(countries, key=lambda row: -row["en"].count(' '))
]
_country_alternation = '|'.join(re.escape(name) for name in _country_names)
countries_pattern = r'\b(' + _country_alternation + r')\b'

# Names of the result lists that will be uploaded; comment a line in or out
# to change the selection.
to_work = [
    "categories_with_nationalities",
    # "categories_with_years",
    # "categories_with_YEAR_COUNTRY_pattern",
    # "categories_with_YEAR_pattern",
]

# One accumulator per result list: {english_text: {"ar": ..., "count": ...}}.
data_lists = {
    list_name: {}
    for list_name in (
        "categories_with_nationalities",
        "categories_with_years",
        "categories_with_YEAR_COUNTRY_pattern",
        "categories_with_YEAR_pattern",
    )
}

# Placeholder tokens substituted into the generated templates.
YEAR_PATTERN = "{YEAR}"
NAT = "{NAT}"
AR_NAT_MEN = "{NAT_MEN}"
COUNTRY_PATTERN = "{COUNTRY}"

# Year token: either a digit range joined by "–" or "-", or a bare 4-digit
# number.  It does not depend on the row, so it is defined once here.
reg_year = r"(\d+[–-]\d+|\d{4})"

# The year-based lists are built only when at least one of them is selected in
# `to_work`.  This replaces an unconditional `continue` that made the whole
# year/country branch unreachable regardless of `to_work`.
years_enabled = any(
    name in to_work
    for name in (
        "categories_with_years",
        "categories_with_YEAR_pattern",
        "categories_with_YEAR_COUNTRY_pattern",
    )
)

# Walk every English/Arabic category pair and collect generalized templates,
# counting how many source rows collapse onto each template.  The Arabic text
# stored for a template is the one from the first row that produced it.
for tab in tqdm.tqdm(data):
    # ---
    key = tab["en"]
    value = tab["ar"]
    # ----
    # Nationality templates: swap the English nationality and its Arabic "men"
    # form for placeholders.
    match1 = re.search(nationalities_pattern, key)
    # ----
    if match1:
        en_country = match1.group(1)
        ar_country = nationalities_dict.get(en_country, {}).get("men", "")
        # ---
        if ar_country and ar_country in value:
            # NOTE(review): the English side is tagged with COUNTRY_PATTERN
            # ("{COUNTRY}") although a NAT placeholder exists and is unused;
            # kept as-is so the published templates do not change — confirm intent.
            key1 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key)
            value1 = re.sub(rf'\b{re.escape(ar_country)}\b', AR_NAT_MEN, value)
            # ---
            # A plain substring hit can still fail the word-boundary
            # substitution, so require both placeholders to be present.
            if COUNTRY_PATTERN in key1 and AR_NAT_MEN in value1:
                entry = data_lists["categories_with_nationalities"].setdefault(
                    key1, {"ar": value1, "count": 0}
                )
                entry["count"] += 1
    # ---
    if not years_enabled:
        continue
    # ---
    # Year templates: only rows whose English and Arabic text carry the same
    # year token (first match on each side) are used.
    key_digits = re.search(reg_year, key)
    value_digits = re.search(reg_year, value)
    # ---
    if key_digits and value_digits and key_digits.group() == value_digits.group():
        entry = data_lists["categories_with_years"].setdefault(
            key, {"ar": value, "count": 0}
        )
        entry["count"] += 1
        # ---
        key2 = key.replace(key_digits.group(), YEAR_PATTERN)
        value2 = value.replace(value_digits.group(), YEAR_PATTERN)
        # ---
        entry = data_lists["categories_with_YEAR_pattern"].setdefault(
            key2, {"ar": value2, "count": 0}
        )
        entry["count"] += 1
        # ----
        # Look for a country name in key2.
        match = re.search(countries_pattern, key2)
        # ----
        if match:
            en_country = match.group(1)
            # Look the row up in countries_dict (the Dataset object itself has
            # no dict-style lookup) and take its Arabic name.
            # Assumes every countries row has an "ar" column — TODO confirm.
            ar_country = countries_dict.get(en_country, {}).get("ar", "")
            # ---
            if ar_country and ar_country in value2:
                key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
                value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)
                # ---
                if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
                    entry = data_lists["categories_with_YEAR_COUNTRY_pattern"].setdefault(
                        key3, {"ar": value3, "count": 0}
                    )
                    entry["count"] += 1
        # ----
# ----
# Summary of the year-based template lists.
print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")

print(f"all data len: {len(data):,}.")

# Turn every selected accumulator into a list of rows and publish it as its
# own dataset on the Hugging Face Hub.
for list_name in to_work:
    collected = data_lists.get(list_name)
    # ---
    if list_name != "countries":
        data_list = [
            {"en": en_text, "ar": info["ar"], "count": info["count"]}
            for en_text, info in collected.items()
        ]
        # Most frequent templates first.
        data_list.sort(key=lambda row: row["count"], reverse=True)
    else:
        # A "countries" list maps English text directly to Arabic text.
        data_list = [
            {"en": en_text, "ar": ar_text}
            for en_text, ar_text in collected.items()
        ]
    # ---
    print("______________")
    print(f"len of {list_name} : {len(data_list)}.")
    # ---
    print("____________________________")
    # ---
    # Build the Dataset.
    dataset = Dataset.from_list(data_list)

    # Upload the Dataset to Hugging Face.
    dataset.push_to_hub(f"Ibrahemqasim/{list_name}")
    # ---
    print(f"dataset: Ibrahemqasim/{list_name} push_to_hub successfully!")