Ibrahemqasim committed on
Commit
d248431
·
verified ·
1 Parent(s): 805087b

Update nat_datasets.py

Browse files
Files changed (1) hide show
  1. nat_datasets.py +12 -6
nat_datasets.py CHANGED
@@ -11,6 +11,7 @@ login(userdata.get('HF_TOKEN'))
11
  data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
12
 
13
  # nationalities.keys() "nat_en","man","men","women","womens","country_en","country_ar",
 
14
  nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
15
  nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"] for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
16
 
@@ -38,6 +39,8 @@ data_lists = {
38
  YEAR_PATTERN = "{YEAR}"
39
  NAT = "{NAT}"
40
  AR_NAT_MEN = "{NAT_MEN}"
 
 
41
  COUNTRY_PATTERN = "{COUNTRY}"
42
 
43
  for tab in tqdm.tqdm(data):
@@ -48,20 +51,20 @@ for tab in tqdm.tqdm(data):
48
  # Add if key and value has 4 digits and they are the same
49
  reg_year = r"(\d+[–-]\d+|\d{4})"
50
  # ---
51
- key_digits = re.search(reg_year, key)
52
- value_digits = re.search(reg_year, value)
53
  # ----
54
- match1 = re.search(nationalities_pattern, key)
55
  # ----
56
  if match1:
57
  en_country = match1.group(1)
58
  ar_country = nationalities_dict.get(en_country, {}).get("men", "")
59
  # ---
60
  if ar_country and ar_country in value:
61
- key1 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key)
62
  value1 = re.sub(rf'\b{re.escape(ar_country)}\b', AR_NAT_MEN, value)
63
  # ---
64
- if COUNTRY_PATTERN in key1 and AR_NAT_MEN in value1:
65
  # ---
66
  if key1 in data_lists["categories_with_nationalities"]:
67
  data_lists["categories_with_nationalities"][key1]["count"] += 1
@@ -88,7 +91,7 @@ for tab in tqdm.tqdm(data):
88
  data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
89
  # ----
90
  # البحث عن اسم الدولة في key2
91
- match = re.search(countries_pattern, key2)
92
  # ----
93
  if match:
94
  en_country = match.group(1)
@@ -127,6 +130,9 @@ for x in to_work:
127
  # ---
128
  print("____________________________")
129
  # ---
 
 
 
130
  # إنشاء Dataset
131
  dataset = Dataset.from_list(data_list)
132
 
 
11
  data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
12
 
13
  # nationalities.keys() "nat_en","man","men","women","womens","country_en","country_ar",
14
+
15
  nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
16
  nationalities_pattern = r'\b(' + '|'.join(map(re.escape, [n["nat_en"] for n in sorted(nationalities, key=lambda x: -x["nat_en"].count(' '))])) + r')\b'
17
 
 
39
  YEAR_PATTERN = "{YEAR}"
40
  NAT = "{NAT}"
41
  AR_NAT_MEN = "{NAT_MEN}"
42
+ EN_NAT_PATTERN = "{EN_NAT}"
43
+
44
  COUNTRY_PATTERN = "{COUNTRY}"
45
 
46
  for tab in tqdm.tqdm(data):
 
51
  # Add if key and value has 4 digits and they are the same
52
  reg_year = r"(\d+[–-]\d+|\d{4})"
53
  # ---
54
+ key_digits = re.search(reg_year, key, re.IGNORECASE)
55
+ value_digits = re.search(reg_year, value, re.IGNORECASE)
56
  # ----
57
+ match1 = re.search(nationalities_pattern, key, re.IGNORECASE)
58
  # ----
59
  if match1:
60
  en_country = match1.group(1)
61
  ar_country = nationalities_dict.get(en_country, {}).get("men", "")
62
  # ---
63
  if ar_country and ar_country in value:
64
+ key1 = re.sub(rf'\b{re.escape(en_country)}\b', EN_NAT_PATTERN, key)
65
  value1 = re.sub(rf'\b{re.escape(ar_country)}\b', AR_NAT_MEN, value)
66
  # ---
67
+ if EN_NAT_PATTERN in key1 and AR_NAT_MEN in value1:
68
  # ---
69
  if key1 in data_lists["categories_with_nationalities"]:
70
  data_lists["categories_with_nationalities"][key1]["count"] += 1
 
91
  data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
92
  # ----
93
  # البحث عن اسم الدولة في key2
94
+ match = re.search(countries_pattern, key2, re.IGNORECASE)
95
  # ----
96
  if match:
97
  en_country = match.group(1)
 
130
  # ---
131
  print("____________________________")
132
  # ---
133
+ if len(data_list) == 0:
134
+ continue
135
+ # ---
136
  # إنشاء Dataset
137
  dataset = Dataset.from_list(data_list)
138