Ibrahemqasim committed on
Commit
10b9478
·
verified ·
1 Parent(s): c10d5df

Update nat_datasets.py

Browse files
Files changed (1) hide show
  1. nat_datasets.py +34 -18
nat_datasets.py CHANGED
@@ -14,7 +14,7 @@ data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
14
 
15
  nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
16
  nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"].lower() for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
17
- nationalities_pattern_ar = r'\b(' + '|'.join(map(re.escape, [n["men"].lower() for n in sorted(nationalities, key=lambda x: -x["men"].count(' '))])) + r')\b'
18
 
19
  # print(nationalities_pattern)
20
 
@@ -39,23 +39,48 @@ data_lists = {
39
 
40
  YEAR_PATTERN = "{YEAR}"
41
  NAT = "{NAT}"
42
- AR_NAT_MEN = "{NAT_MEN}"
43
- AR_NAT_WOMENS = "{NAT_WOMENS}"
44
  EN_NAT_PATTERN = "{EN_NAT}"
45
 
46
  COUNTRY_PATTERN = "{COUNTRY}"
47
 
48
  # data = [{"en": "Category:1970s yemeni peoples", "ar": "تصنيف: يمنيون في عقد 1970"}]
49
  match1_done = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  for tab in tqdm.tqdm(data):
51
  # ---
52
  key = tab["en"]
53
  value = tab["ar"]
54
  # ---
55
- match1 = re.search(nationalities_pattern, key, re.IGNORECASE)
56
- match2 = re.search(nationalities_pattern_ar, value, re.IGNORECASE)
57
  # ----
58
- if match1 or match2:
59
  # ---
60
  match1_done += 1
61
  # ---
@@ -64,25 +89,16 @@ for tab in tqdm.tqdm(data):
64
  else:
65
  data_lists["categories_with_nationalities"][key] = {"ar": value, "count": 1}
66
  # ---
67
- if not match1:
68
  continue
69
  # ---
70
- en_country = match1.group(1)
71
  ar_tab = nationalities_dict.get(en_country.lower(), {})
72
  # ---
73
  if not ar_tab:
74
  continue
75
  # ---
76
- ar_country = ar_tab.get("men", "")
77
- # ---
78
- NAT_PATTERN = ""
79
- # ---
80
- if ar_country and ar_country in value:
81
- NAT_PATTERN = AR_NAT_MEN
82
- else:
83
- ar_country = ar_tab.get("womens", "")
84
- if ar_country and ar_country in value:
85
- NAT_PATTERN = AR_NAT_WOMENS
86
  # ---
87
  if not NAT_PATTERN:
88
  continue
 
14
 
15
  nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
16
  nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"].lower() for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
17
+ nationalities_pattern_ar = r'(' + '|'.join(map(re.escape, [n["man"].lower() for n in sorted(nationalities, key=lambda x: -x["man"].count(' '))])) + r')'
18
 
19
  # print(nationalities_pattern)
20
 
 
39
 
40
  YEAR_PATTERN = "{YEAR}"
41
  NAT = "{NAT}"
 
 
42
  EN_NAT_PATTERN = "{EN_NAT}"
43
 
44
  COUNTRY_PATTERN = "{COUNTRY}"
45
 
46
  # data = [{"en": "Category:1970s yemeni peoples", "ar": "تصنيف: يمنيون في عقد 1970"}]
47
  match1_done = 0
48
+
49
+
50
def new_func(value, ar_tab):
    """Find which Arabic nationality form from ``ar_tab`` occurs in ``value``.

    The forms are tried in a fixed priority order -- "men", "womens",
    "women", "man" -- and the first one that is non-empty and is a substring
    of ``value`` wins.

    Args:
        value: Text to search (the caller passes the Arabic category title).
        ar_tab: Mapping for one nationality; may contain any of the keys
            "men", "womens", "women", "man". Missing or empty entries are
            skipped.

    Returns:
        A ``(matched_form, placeholder)`` tuple, where ``placeholder`` is one
        of "{NAT_MEN}", "{NAT_WOMENS}", "{NAT_WOMEN}", "{NAT_MAN}"; or
        ``("", "")`` when no form occurs in ``value``.
    """
    # Priority order matters: a shorter form can be a substring of a longer
    # one, so the order of the original if-chain is preserved here.
    forms = (
        ("men", "{NAT_MEN}"),
        ("womens", "{NAT_WOMENS}"),
        ("women", "{NAT_WOMEN}"),
        ("man", "{NAT_MAN}"),
    )
    for field, placeholder in forms:
        ar_form = ar_tab.get(field, "")
        # Each form is tested against `value` itself. The previous code
        # tested the "womens" form in the "women" and "man" branches, which
        # produced false matches when "womens" was empty ("" is a substring
        # of every string) and made those branches unreachable otherwise.
        if ar_form and ar_form in value:
            return ar_form, placeholder
    return "", ""
73
+
74
+
75
  for tab in tqdm.tqdm(data):
76
  # ---
77
  key = tab["en"]
78
  value = tab["ar"]
79
  # ---
80
+ match_en = re.search(nationalities_pattern, key, re.IGNORECASE)
81
+ match_ar = re.search(nationalities_pattern_ar, value, re.IGNORECASE)
82
  # ----
83
+ if match_en or match_ar:
84
  # ---
85
  match1_done += 1
86
  # ---
 
89
  else:
90
  data_lists["categories_with_nationalities"][key] = {"ar": value, "count": 1}
91
  # ---
92
+ if not match_en:
93
  continue
94
  # ---
95
+ en_country = match_en.group(1)
96
  ar_tab = nationalities_dict.get(en_country.lower(), {})
97
  # ---
98
  if not ar_tab:
99
  continue
100
  # ---
101
+ ar_country, NAT_PATTERN = new_func(value, ar_tab)
 
 
 
 
 
 
 
 
 
102
  # ---
103
  if not NAT_PATTERN:
104
  continue