Update nat_datasets.py
Files changed: nat_datasets.py (+12 -6)
nat_datasets.py
CHANGED
@@ -11,6 +11,7 @@ login(userdata.get('HF_TOKEN'))
|
|
11 |
data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
|
12 |
|
13 |
# nationalities.keys() "nat_en","man","men","women","womens","country_en","country_ar",
|
|
|
14 |
nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
|
15 |
nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"] for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
|
16 |
|
@@ -38,6 +39,8 @@ data_lists = {
|
|
38 |
YEAR_PATTERN = "{YEAR}"
|
39 |
NAT = "{NAT}"
|
40 |
AR_NAT_MEN = "{NAT_MEN}"
|
|
|
|
|
41 |
COUNTRY_PATTERN = "{COUNTRY}"
|
42 |
|
43 |
for tab in tqdm.tqdm(data):
|
@@ -48,20 +51,20 @@ for tab in tqdm.tqdm(data):
|
|
48 |
# Add if key and value has 4 digits and they are the same
|
49 |
reg_year = r"(\d+[–-]\d+|\d{4})"
|
50 |
# ---
|
51 |
-
key_digits = re.search(reg_year, key)
|
52 |
-
value_digits = re.search(reg_year, value)
|
53 |
# ----
|
54 |
-
match1 = re.search(nationalities_pattern, key)
|
55 |
# ----
|
56 |
if match1:
|
57 |
en_country = match1.group(1)
|
58 |
ar_country = nationalities_dict.get(en_country, {}).get("men", "")
|
59 |
# ---
|
60 |
if ar_country and ar_country in value:
|
61 |
-
key1 = re.sub(rf'\b{re.escape(en_country)}\b',
|
62 |
value1 = re.sub(rf'\b{re.escape(ar_country)}\b', AR_NAT_MEN, value)
|
63 |
# ---
|
64 |
-
if
|
65 |
# ---
|
66 |
if key1 in data_lists["categories_with_nationalities"]:
|
67 |
data_lists["categories_with_nationalities"][key1]["count"] += 1
|
@@ -88,7 +91,7 @@ for tab in tqdm.tqdm(data):
|
|
88 |
data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
|
89 |
# ----
|
90 |
# البحث عن اسم الدولة في key2
|
91 |
-
match = re.search(countries_pattern, key2)
|
92 |
# ----
|
93 |
if match:
|
94 |
en_country = match.group(1)
|
@@ -127,6 +130,9 @@ for x in to_work:
|
|
127 |
# ---
|
128 |
print("____________________________")
|
129 |
# ---
|
|
|
|
|
|
|
130 |
# إنشاء Dataset
|
131 |
dataset = Dataset.from_list(data_list)
|
132 |
|
|
|
11 |
data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
|
12 |
|
13 |
# nationalities.keys() "nat_en","man","men","women","womens","country_en","country_ar",
|
14 |
+
|
15 |
nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
|
16 |
nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"] for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
|
17 |
|
|
|
39 |
YEAR_PATTERN = "{YEAR}"
|
40 |
NAT = "{NAT}"
|
41 |
AR_NAT_MEN = "{NAT_MEN}"
|
42 |
+
EN_NAT_PATTERN = "{EN_NAT}"
|
43 |
+
|
44 |
COUNTRY_PATTERN = "{COUNTRY}"
|
45 |
|
46 |
for tab in tqdm.tqdm(data):
|
|
|
51 |
# Add if key and value has 4 digits and they are the same
|
52 |
reg_year = r"(\d+[–-]\d+|\d{4})"
|
53 |
# ---
|
54 |
+
key_digits = re.search(reg_year, key, re.IGNORECASE)
|
55 |
+
value_digits = re.search(reg_year, value, re.IGNORECASE)
|
56 |
# ----
|
57 |
+
match1 = re.search(nationalities_pattern, key, re.IGNORECASE)
|
58 |
# ----
|
59 |
if match1:
|
60 |
en_country = match1.group(1)
|
61 |
ar_country = nationalities_dict.get(en_country, {}).get("men", "")
|
62 |
# ---
|
63 |
if ar_country and ar_country in value:
|
64 |
+
key1 = re.sub(rf'\b{re.escape(en_country)}\b', EN_NAT_PATTERN, key)
|
65 |
value1 = re.sub(rf'\b{re.escape(ar_country)}\b', AR_NAT_MEN, value)
|
66 |
# ---
|
67 |
+
if EN_NAT_PATTERN in key1 and AR_NAT_MEN in value1:
|
68 |
# ---
|
69 |
if key1 in data_lists["categories_with_nationalities"]:
|
70 |
data_lists["categories_with_nationalities"][key1]["count"] += 1
|
|
|
91 |
data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
|
92 |
# ----
|
93 |
# البحث عن اسم الدولة في key2
|
94 |
+
match = re.search(countries_pattern, key2, re.IGNORECASE)
|
95 |
# ----
|
96 |
if match:
|
97 |
en_country = match.group(1)
|
|
|
130 |
# ---
|
131 |
print("____________________________")
|
132 |
# ---
|
133 |
+
if len(data_list) == 0:
|
134 |
+
continue
|
135 |
+
# ---
|
136 |
# إنشاء Dataset
|
137 |
dataset = Dataset.from_list(data_list)
|
138 |
|