Update nat_datasets.py
Browse files- nat_datasets.py +34 -18
nat_datasets.py
CHANGED
@@ -14,7 +14,7 @@ data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
|
|
14 |
|
15 |
nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
|
16 |
nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"].lower() for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
|
17 |
-
nationalities_pattern_ar = r'
|
18 |
|
19 |
# print(nationalities_pattern)
|
20 |
|
@@ -39,23 +39,48 @@ data_lists = {
|
|
39 |
|
40 |
YEAR_PATTERN = "{YEAR}"
|
41 |
NAT = "{NAT}"
|
42 |
-
AR_NAT_MEN = "{NAT_MEN}"
|
43 |
-
AR_NAT_WOMENS = "{NAT_WOMENS}"
|
44 |
EN_NAT_PATTERN = "{EN_NAT}"
|
45 |
|
46 |
COUNTRY_PATTERN = "{COUNTRY}"
|
47 |
|
48 |
# data = [{"en": "Category:1970s yemeni peoples", "ar": "تصنيف: يمنيون في عقد 1970"}]
|
49 |
match1_done = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
for tab in tqdm.tqdm(data):
|
51 |
# ---
|
52 |
key = tab["en"]
|
53 |
value = tab["ar"]
|
54 |
# ---
|
55 |
-
|
56 |
-
|
57 |
# ----
|
58 |
-
if
|
59 |
# ---
|
60 |
match1_done += 1
|
61 |
# ---
|
@@ -64,25 +89,16 @@ for tab in tqdm.tqdm(data):
|
|
64 |
else:
|
65 |
data_lists["categories_with_nationalities"][key] = {"ar": value, "count": 1}
|
66 |
# ---
|
67 |
-
if not
|
68 |
continue
|
69 |
# ---
|
70 |
-
en_country =
|
71 |
ar_tab = nationalities_dict.get(en_country.lower(), {})
|
72 |
# ---
|
73 |
if not ar_tab:
|
74 |
continue
|
75 |
# ---
|
76 |
-
ar_country =
|
77 |
-
# ---
|
78 |
-
NAT_PATTERN = ""
|
79 |
-
# ---
|
80 |
-
if ar_country and ar_country in value:
|
81 |
-
NAT_PATTERN = AR_NAT_MEN
|
82 |
-
else:
|
83 |
-
ar_country = ar_tab.get("womens", "")
|
84 |
-
if ar_country and ar_country in value:
|
85 |
-
NAT_PATTERN = AR_NAT_WOMENS
|
86 |
# ---
|
87 |
if not NAT_PATTERN:
|
88 |
continue
|
|
|
14 |
|
15 |
nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
|
16 |
nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"].lower() for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
|
17 |
+
nationalities_pattern_ar = r'(' + '|'.join(map(re.escape, [n["man"].lower() for n in sorted(nationalities, key=lambda x: -x["man"].count(' '))])) + r')'
|
18 |
|
19 |
# print(nationalities_pattern)
|
20 |
|
|
|
39 |
|
40 |
YEAR_PATTERN = "{YEAR}"
|
41 |
NAT = "{NAT}"
|
|
|
|
|
42 |
EN_NAT_PATTERN = "{EN_NAT}"
|
43 |
|
44 |
COUNTRY_PATTERN = "{COUNTRY}"
|
45 |
|
46 |
# data = [{"en": "Category:1970s yemeni peoples", "ar": "تصنيف: يمنيون في عقد 1970"}]
|
47 |
match1_done = 0
|
48 |
+
|
49 |
+
|
50 |
+
def new_func(value, ar_tab):
    """Find which Arabic nationality form from ``ar_tab`` occurs in ``value``.

    Args:
        value: Arabic text to search (the caller passes the Arabic category
            title).
        ar_tab: Mapping that may hold Arabic nationality forms under the keys
            ``"men"``, ``"womens"``, ``"women"`` and ``"man"``. Missing or
            empty forms are ignored.

    Returns:
        A ``(matched_form, placeholder)`` tuple, where ``placeholder`` is one
        of ``"{NAT_MEN}"``, ``"{NAT_WOMENS}"``, ``"{NAT_WOMEN}"`` or
        ``"{NAT_MAN}"``. Returns ``("", "")`` when no form is found.
    """
    # Checked in this order; the first form found in ``value`` wins.
    forms = (
        ("men", "{NAT_MEN}"),
        ("womens", "{NAT_WOMENS}"),
        ("women", "{NAT_WOMEN}"),
        ("man", "{NAT_MAN}"),
    )
    # ---
    for form_key, placeholder in forms:
        ar_form = ar_tab.get(form_key, "")
        # Each form is tested against ``value`` itself. The emptiness guard
        # is required because "" is a substring of every string.
        if ar_form and ar_form in value:
            return ar_form, placeholder
    # ---
    return "", ""
|
73 |
+
|
74 |
+
|
75 |
for tab in tqdm.tqdm(data):
|
76 |
# ---
|
77 |
key = tab["en"]
|
78 |
value = tab["ar"]
|
79 |
# ---
|
80 |
+
match_en = re.search(nationalities_pattern, key, re.IGNORECASE)
|
81 |
+
match_ar = re.search(nationalities_pattern_ar, value, re.IGNORECASE)
|
82 |
# ----
|
83 |
+
if match_en or match_ar:
|
84 |
# ---
|
85 |
match1_done += 1
|
86 |
# ---
|
|
|
89 |
else:
|
90 |
data_lists["categories_with_nationalities"][key] = {"ar": value, "count": 1}
|
91 |
# ---
|
92 |
+
if not match_en:
|
93 |
continue
|
94 |
# ---
|
95 |
+
en_country = match_en.group(1)
|
96 |
ar_tab = nationalities_dict.get(en_country.lower(), {})
|
97 |
# ---
|
98 |
if not ar_tab:
|
99 |
continue
|
100 |
# ---
|
101 |
+
ar_country, NAT_PATTERN = new_func(value, ar_tab)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
# ---
|
103 |
if not NAT_PATTERN:
|
104 |
continue
|