Ibrahemqasim committed on
Commit
07a48c1
·
verified ·
1 Parent(s): 1c94de7

Update nat_datasets.py

Browse files
Files changed (1) hide show
  1. nat_datasets.py +43 -43
nat_datasets.py CHANGED
@@ -65,7 +65,7 @@ def new_func(value, ar_tab):
65
  country2 = f"ال{country}".replace(" ", " ال")
66
  # ---
67
  if country2 in value:
68
- return country2, tag
69
  elif country in value:
70
  return country, tag
71
 
@@ -109,54 +109,54 @@ for tab in tqdm.tqdm(data):
109
  value1 = re.sub(rf'\b{re.escape(ar_country)}\b', NAT_PATTERN, f" {value} ", re.IGNORECASE)
110
  value1 = value1.strip()
111
  # ---
112
- # if EN_NAT_PATTERN in key1 and NAT_PATTERN in value1:
113
- # ---
114
- if key1 in data_lists["categories_with_NAT_pattern"]:
115
- data_lists["categories_with_NAT_pattern"][key1]["count"] += 1
116
- else:
117
- data_lists["categories_with_NAT_pattern"][key1] = {"ar": value1, "count": 1}
118
- # ---
119
- # continue
120
- # ---
121
- # Add if key and value has 4 digits and they are the same
122
- reg_year = r"(\d+[–-]\d+|\d{4})"
123
- # ---
124
- key_digits = re.search(reg_year, key, re.IGNORECASE)
125
- value_digits = re.search(reg_year, value1, re.IGNORECASE)
126
- # ----
127
- if key_digits and value_digits and key_digits.group() == value_digits.group():
128
- # if key1 in data_lists["categories_with_years"]:
129
- # data_lists["categories_with_years"][key1]["count"] += 1
130
- # else:
131
- # data_lists["categories_with_years"][key1] = {"ar": value1, "count": 1}
132
  # ---
133
- key2 = key1.replace(key_digits.group(), YEAR_PATTERN)
134
- value2 = value1.replace(value_digits.group(), YEAR_PATTERN)
135
- # ---
136
- if key2 in data_lists["categories_with_YEAR_NAT_pattern"]:
137
- data_lists["categories_with_YEAR_NAT_pattern"][key2]["count"] += 1
138
  else:
139
- data_lists["categories_with_YEAR_NAT_pattern"][key2] = {"ar": value2, "count": 1}
140
- # ----
141
- continue
142
- # ----
143
- # البحث عن اسم الدولة في key2
144
- match = re.search(countries_pattern, key2, re.IGNORECASE)
 
 
 
145
  # ----
146
- if match:
147
- en_country = match.group(1)
148
- ar_country = countries.get(en_country)
 
 
 
 
 
149
  # ---
150
- if ar_country and ar_country in value2:
151
- key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2, re.IGNORECASE)
152
- value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2, re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
153
  # ---
154
- if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
 
 
155
  # ---
156
- if key3 in data_lists["categories_with_YEAR_COUNTRY_pattern"]:
157
- data_lists["categories_with_YEAR_COUNTRY_pattern"][key3]["count"] += 1
158
- else:
159
- data_lists["categories_with_YEAR_COUNTRY_pattern"][key3] = {"ar": value3, "count": 1}
 
 
160
  # ----
161
  print(f"{match1_done=}")
162
 
 
65
  country2 = f"ال{country}".replace(" ", " ال")
66
  # ---
67
  if country2 in value:
68
+ return country2, tag.replace("}", "_AL}")
69
  elif country in value:
70
  return country, tag
71
 
 
109
  value1 = re.sub(rf'\b{re.escape(ar_country)}\b', NAT_PATTERN, f" {value} ", re.IGNORECASE)
110
  value1 = value1.strip()
111
  # ---
112
+ if EN_NAT_PATTERN in key1 and NAT_PATTERN in value1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  # ---
114
+ if key1 in data_lists["categories_with_NAT_pattern"]:
115
+ data_lists["categories_with_NAT_pattern"][key1]["count"] += 1
 
 
 
116
  else:
117
+ data_lists["categories_with_NAT_pattern"][key1] = {"ar": value1, "count": 1}
118
+ # ---
119
+ # continue
120
+ # ---
121
+ # Add if key and value has 4 digits and they are the same
122
+ reg_year = r"(\d+[–-]\d+|\d{4})"
123
+ # ---
124
+ key_digits = re.search(reg_year, key, re.IGNORECASE)
125
+ value_digits = re.search(reg_year, value1, re.IGNORECASE)
126
  # ----
127
+ if key_digits and value_digits and key_digits.group() == value_digits.group():
128
+ # if key1 in data_lists["categories_with_years"]:
129
+ # data_lists["categories_with_years"][key1]["count"] += 1
130
+ # else:
131
+ # data_lists["categories_with_years"][key1] = {"ar": value1, "count": 1}
132
+ # ---
133
+ key2 = key1.replace(key_digits.group(), YEAR_PATTERN)
134
+ value2 = value1.replace(value_digits.group(), YEAR_PATTERN)
135
  # ---
136
+ if key2 in data_lists["categories_with_YEAR_NAT_pattern"]:
137
+ data_lists["categories_with_YEAR_NAT_pattern"][key2]["count"] += 1
138
+ else:
139
+ data_lists["categories_with_YEAR_NAT_pattern"][key2] = {"ar": value2, "count": 1}
140
+ # ----
141
+ continue
142
+ # ----
143
+ # البحث عن اسم الدولة في key2
144
+ match = re.search(countries_pattern, key2, re.IGNORECASE)
145
+ # ----
146
+ if match:
147
+ en_country = match.group(1)
148
+ ar_country = countries.get(en_country)
149
  # ---
150
+ if ar_country and ar_country in value2:
151
+ key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2, re.IGNORECASE)
152
+ value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2, re.IGNORECASE)
153
  # ---
154
+ if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
155
+ # ---
156
+ if key3 in data_lists["categories_with_YEAR_COUNTRY_pattern"]:
157
+ data_lists["categories_with_YEAR_COUNTRY_pattern"][key3]["count"] += 1
158
+ else:
159
+ data_lists["categories_with_YEAR_COUNTRY_pattern"][key3] = {"ar": value3, "count": 1}
160
  # ----
161
  print(f"{match1_done=}")
162