Ibrahemqasim committed on
Commit
c10d5df
·
verified ·
1 Parent(s): d248431

Update nat_datasets.py

Browse files
Files changed (1) hide show
  1. nat_datasets.py +73 -46
nat_datasets.py CHANGED
@@ -13,82 +13,113 @@ data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
13
  # nationalities.keys() "nat_en","man","men","women","womens","country_en","country_ar",
14
 
15
  nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
16
- nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"] for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
 
 
 
17
 
18
  countries = load_dataset("Ibrahemqasim/countries", split="train")
19
  countries_pattern = r'\b(' + '|'.join(map(re.escape, [n["en"] for n in sorted(countries, key=lambda x: -x["en"].count(' '))])) + r')\b'
20
 
21
  # ---
22
  countries_dict = {cc["en"]: cc for cc in countries}
23
- nationalities_dict = {cc["nat_en"]: cc for cc in nationalities}
24
  # ---
25
  to_work = [
26
  "categories_with_nationalities",
27
- # "categories_with_years",
28
- # "categories_with_YEAR_COUNTRY_pattern",
29
- # "categories_with_YEAR_pattern",
30
  ]
31
 
32
  data_lists = {
33
  "categories_with_nationalities" : {},
34
- "categories_with_years" : {},
35
- "categories_with_YEAR_COUNTRY_pattern" : {},
36
- "categories_with_YEAR_pattern" : {},
37
  }
38
 
39
  YEAR_PATTERN = "{YEAR}"
40
  NAT = "{NAT}"
41
  AR_NAT_MEN = "{NAT_MEN}"
 
42
  EN_NAT_PATTERN = "{EN_NAT}"
43
 
44
  COUNTRY_PATTERN = "{COUNTRY}"
45
 
 
 
46
  for tab in tqdm.tqdm(data):
47
  # ---
48
  key = tab["en"]
49
  value = tab["ar"]
50
  # ---
51
- # Add if key and value has 4 digits and they are the same
52
- reg_year = r"(\d+[–-]\d+|\d{4})"
53
- # ---
54
- key_digits = re.search(reg_year, key, re.IGNORECASE)
55
- value_digits = re.search(reg_year, value, re.IGNORECASE)
56
- # ----
57
  match1 = re.search(nationalities_pattern, key, re.IGNORECASE)
 
58
  # ----
59
- if match1:
60
- en_country = match1.group(1)
61
- ar_country = nationalities_dict.get(en_country, {}).get("men", "")
62
  # ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  if ar_country and ar_country in value:
64
- key1 = re.sub(rf'\b{re.escape(en_country)}\b', EN_NAT_PATTERN, key)
65
- value1 = re.sub(rf'\b{re.escape(ar_country)}\b', AR_NAT_MEN, value)
66
- # ---
67
- if EN_NAT_PATTERN in key1 and AR_NAT_MEN in value1:
68
- # ---
69
- if key1 in data_lists["categories_with_nationalities"]:
70
- data_lists["categories_with_nationalities"][key1]["count"] += 1
71
- else:
72
- data_lists["categories_with_nationalities"][key1] = {"ar": value1, "count": 1}
73
  # ---
74
- continue
 
 
 
 
 
 
 
 
75
  # ---
 
 
 
76
  if key_digits and value_digits and key_digits.group() == value_digits.group():
77
- # data_lists["categories_with_years"].append({"en": key, "ar": value, "count": 1})
78
- if key in data_lists["categories_with_years"]:
79
- data_lists["categories_with_years"][key]["count"] += 1
80
- else:
81
- data_lists["categories_with_years"][key] = {"ar": value, "count": 1}
82
- # ---
83
- key2 = key.replace(key_digits.group(), YEAR_PATTERN)
84
- value2 = value.replace(value_digits.group(), YEAR_PATTERN)
85
  # ---
86
- # data_lists["categories_with_YEAR_pattern"].append({"en": key2, "ar": value2, "count": 1})
 
87
  # ---
88
- if key2 in data_lists["categories_with_YEAR_pattern"]:
89
- data_lists["categories_with_YEAR_pattern"][key2]["count"] += 1
90
  else:
91
- data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
 
 
92
  # ----
93
  # البحث عن اسم الدولة في key2
94
  match = re.search(countries_pattern, key2, re.IGNORECASE)
@@ -98,8 +129,8 @@ for tab in tqdm.tqdm(data):
98
  ar_country = countries.get(en_country)
99
  # ---
100
  if ar_country and ar_country in value2:
101
- key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
102
- value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)
103
  # ---
104
  if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
105
  # ---
@@ -107,12 +138,8 @@ for tab in tqdm.tqdm(data):
107
  data_lists["categories_with_YEAR_COUNTRY_pattern"][key3]["count"] += 1
108
  else:
109
  data_lists["categories_with_YEAR_COUNTRY_pattern"][key3] = {"ar": value3, "count": 1}
110
- # ----
111
  # ----
112
- print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
113
- print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")
114
-
115
- print(f"all data len: {len(data):,}.")
116
 
117
  # for x, data_list in data_lists.items():
118
  for x in to_work:
 
13
  # nationalities.keys() "nat_en","man","men","women","womens","country_en","country_ar",
14
 
15
  nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
16
+ nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"].lower() for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
17
+ nationalities_pattern_ar = r'\b(' + '|'.join(map(re.escape, [n["men"].lower() for n in sorted(nationalities, key=lambda x: -x["men"].count(' '))])) + r')\b'
18
+
19
+ # print(nationalities_pattern)
20
 
21
  countries = load_dataset("Ibrahemqasim/countries", split="train")
22
  countries_pattern = r'\b(' + '|'.join(map(re.escape, [n["en"] for n in sorted(countries, key=lambda x: -x["en"].count(' '))])) + r')\b'
23
 
24
  # ---
25
  countries_dict = {cc["en"]: cc for cc in countries}
26
+ nationalities_dict = {cc["nat_en"].lower(): cc for cc in nationalities}
27
  # ---
28
  to_work = [
29
  "categories_with_nationalities",
30
+ "categories_with_NAT_pattern",
31
+ "categories_with_YEAR_NAT_pattern",
 
32
  ]
33
 
34
  data_lists = {
35
  "categories_with_nationalities" : {},
36
+ "categories_with_NAT_pattern" : {},
37
+ "categories_with_YEAR_NAT_pattern" : {},
 
38
  }
39
 
40
  YEAR_PATTERN = "{YEAR}"
41
  NAT = "{NAT}"
42
  AR_NAT_MEN = "{NAT_MEN}"
43
+ AR_NAT_WOMENS = "{NAT_WOMENS}"
44
  EN_NAT_PATTERN = "{EN_NAT}"
45
 
46
  COUNTRY_PATTERN = "{COUNTRY}"
47
 
48
+ # data = [{"en": "Category:1970s yemeni peoples", "ar": "تصنيف: يمنيون في عقد 1970"}]
49
+ match1_done = 0
50
  for tab in tqdm.tqdm(data):
51
  # ---
52
  key = tab["en"]
53
  value = tab["ar"]
54
  # ---
 
 
 
 
 
 
55
  match1 = re.search(nationalities_pattern, key, re.IGNORECASE)
56
+ match2 = re.search(nationalities_pattern_ar, value, re.IGNORECASE)
57
  # ----
58
+ if match1 or match2:
59
+ # ---
60
+ match1_done += 1
61
  # ---
62
+ if key in data_lists["categories_with_nationalities"]:
63
+ data_lists["categories_with_nationalities"][key]["count"] += 1
64
+ else:
65
+ data_lists["categories_with_nationalities"][key] = {"ar": value, "count": 1}
66
+ # ---
67
+ if not match1:
68
+ continue
69
+ # ---
70
+ en_country = match1.group(1)
71
+ ar_tab = nationalities_dict.get(en_country.lower(), {})
72
+ # ---
73
+ if not ar_tab:
74
+ continue
75
+ # ---
76
+ ar_country = ar_tab.get("men", "")
77
+ # ---
78
+ NAT_PATTERN = ""
79
+ # ---
80
+ if ar_country and ar_country in value:
81
+ NAT_PATTERN = AR_NAT_MEN
82
+ else:
83
+ ar_country = ar_tab.get("womens", "")
84
  if ar_country and ar_country in value:
85
+ NAT_PATTERN = AR_NAT_WOMENS
86
+ # ---
87
+ if not NAT_PATTERN:
88
+ continue
89
+ # ---
90
+ key1 = re.sub(rf'\b{re.escape(en_country)}\b', EN_NAT_PATTERN, key, re.IGNORECASE)
91
+ value1 = re.sub(rf'\b{re.escape(ar_country)}\b', NAT_PATTERN, value, re.IGNORECASE)
92
+ # ---
93
+ # if EN_NAT_PATTERN in key1 and NAT_PATTERN in value1:
94
  # ---
95
+ if key1 in data_lists["categories_with_NAT_pattern"]:
96
+ data_lists["categories_with_NAT_pattern"][key1]["count"] += 1
97
+ else:
98
+ data_lists["categories_with_NAT_pattern"][key1] = {"ar": value1, "count": 1}
99
+ # ---
100
+ # continue
101
+ # ---
102
+ # Add if key and value has 4 digits and they are the same
103
+ reg_year = r"(\d+[–-]\d+|\d{4})"
104
  # ---
105
+ key_digits = re.search(reg_year, key, re.IGNORECASE)
106
+ value_digits = re.search(reg_year, value1, re.IGNORECASE)
107
+ # ----
108
  if key_digits and value_digits and key_digits.group() == value_digits.group():
109
+ # if key1 in data_lists["categories_with_years"]:
110
+ # data_lists["categories_with_years"][key1]["count"] += 1
111
+ # else:
112
+ # data_lists["categories_with_years"][key1] = {"ar": value1, "count": 1}
 
 
 
 
113
  # ---
114
+ key2 = key1.replace(key_digits.group(), YEAR_PATTERN)
115
+ value2 = value1.replace(value_digits.group(), YEAR_PATTERN)
116
  # ---
117
+ if key2 in data_lists["categories_with_YEAR_NAT_pattern"]:
118
+ data_lists["categories_with_YEAR_NAT_pattern"][key2]["count"] += 1
119
  else:
120
+ data_lists["categories_with_YEAR_NAT_pattern"][key2] = {"ar": value2, "count": 1}
121
+ # ----
122
+ continue
123
  # ----
124
  # البحث عن اسم الدولة في key2
125
  match = re.search(countries_pattern, key2, re.IGNORECASE)
 
129
  ar_country = countries.get(en_country)
130
  # ---
131
  if ar_country and ar_country in value2:
132
+ key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2, re.IGNORECASE)
133
+ value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2, re.IGNORECASE)
134
  # ---
135
  if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
136
  # ---
 
138
  data_lists["categories_with_YEAR_COUNTRY_pattern"][key3]["count"] += 1
139
  else:
140
  data_lists["categories_with_YEAR_COUNTRY_pattern"][key3] = {"ar": value3, "count": 1}
 
141
  # ----
142
+ print(f"{match1_done=}")
 
 
 
143
 
144
  # for x, data_list in data_lists.items():
145
  for x in to_work: