Ibrahemqasim committed on
Commit
b5fe413
·
verified ·
1 Parent(s): 46e3596

Update fix langlinks.py

Browse files
Files changed (1) hide show
  1. fix langlinks.py +41 -30
fix langlinks.py CHANGED
@@ -24,10 +24,10 @@ countries = response2.json()
24
  # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
25
 
26
  to_work = [
27
- # "langlinks",
28
- # "filtered_data",
29
  "cats_2000_contry",
30
- # "cats_2000",
31
  ]
32
 
33
  data_lists = {
@@ -44,6 +44,8 @@ sorted_keys = sorted(countries.keys(), key=lambda x: -x.count(' '))
44
  # نبني تعبير regex
45
  regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
46
 
 
 
47
 
48
  for tab in tqdm.tqdm(data):
49
  # ---
@@ -61,22 +63,34 @@ for tab in tqdm.tqdm(data):
61
  if value.startswith(':"') and value.endswith('",'):
62
  value = value[2:-2]
63
  # ----
64
- # data_lists["langlinks"].append({"en": key, "ar": value})
65
- data_lists["langlinks"][key] = value
 
 
 
66
  # ----
67
  # Add if key and value both have 4 digits and they are the same
68
- key_digits = re.search(r"\d{4}", key)
69
- value_digits = re.search(r"\d{4}", value)
 
 
70
  # ----
71
  if key_digits and value_digits and key_digits.group() == value_digits.group():
72
- # data_lists["filtered_data"].append({"en": key, "ar": value})
73
- data_lists["filtered_data"][key] = value
 
 
 
 
 
 
74
  # ---
75
- key2 = key.replace(key_digits.group(), "2000")
76
- value2 = value.replace(value_digits.group(), "2000")
77
  # ---
78
- # data_lists["cats_2000"].append({"en": key2, "ar": value2})
79
- data_lists["cats_2000"][key2] = value2
 
 
80
  # ----
81
  # البحث عن اسم الدولة في key2
82
  match = re.search(regex_pattern, key2)
@@ -86,24 +100,19 @@ for tab in tqdm.tqdm(data):
86
  ar_country = countries.get(en_country)
87
  # ---
88
  if ar_country and ar_country in value2:
89
- key3 = re.sub(rf'\b{re.escape(en_country)}\b', 'country', key2)
90
- value3 = re.sub(rf'\b{re.escape(ar_country)}\b', 'country', value2)
91
  # ---
92
- if key3 not in data_lists["cats_2000_contry"]:
93
- print(f"{key3} → {value3}")
94
  # ---
95
- data_lists["cats_2000_contry"][key3] = value3
96
- # ----
97
- """
98
- for en_c, ar_c in countries.items():
99
- if en_c in key2 and ar_c in value2:
100
- key3 = key2.replace(en_c, "country")
101
- value3 = value2.replace(ar_c, "country")
102
- # ---
103
- data_lists["cats_2000_contry"][key3] = value3
104
- """
105
  # ----
106
-
 
 
107
 
108
  print(f"all data len: {len(data):,}.")
109
 
@@ -118,7 +127,7 @@ datasets_list = {
118
  for x in to_work:
119
  data_list = data_lists.get(x)
120
  # ---
121
- data_list = [{"en": key, "ar": value} for key, value in data_list.items()]
122
 
123
  # حفظ القاموس المصحح في ملف JSON
124
  with open(f"{x}.json", "w", encoding="utf-8") as f:
@@ -127,7 +136,9 @@ for x in to_work:
127
  print("______________")
128
  print(f"file: {x} uploaded successfully!")
129
  print(f"len of {x} : {len(data_list)}.")
130
- #continue
 
 
131
  upload_file(
132
  path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
133
  path_in_repo=f"{x}.json", # المسار داخل المستودع
 
24
  # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
25
 
26
  to_work = [
27
+ "langlinks",
28
+ "filtered_data",
29
  "cats_2000_contry",
30
+ "cats_2000",
31
  ]
32
 
33
  data_lists = {
 
44
  # نبني تعبير regex
45
  regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
46
 
47
+ YEAR_PATTERN = "{YEAR}"
48
+ COUNTRY_PATTERN = "{COUNTRY}"
49
 
50
  for tab in tqdm.tqdm(data):
51
  # ---
 
63
  if value.startswith(':"') and value.endswith('",'):
64
  value = value[2:-2]
65
  # ----
66
+ # data_lists["langlinks"].append({"en": key, "ar": value, "count": 0})
67
+ if key in data_lists["langlinks"]:
68
+ data_lists["langlinks"][key]["count"] += 1
69
+ else:
70
+ data_lists["langlinks"][key] = {"ar": value, "count": 0}
71
  # ----
72
  # Add if key and value both contain a 4-digit year or a digit range (e.g. 1990–91) and they are the same
73
+ reg_year = r"(\d+[–-]\d+|\d{4})"
74
+ # ---
75
+ key_digits = re.search(reg_year, key)
76
+ value_digits = re.search(reg_year, value)
77
  # ----
78
  if key_digits and value_digits and key_digits.group() == value_digits.group():
79
+ # data_lists["filtered_data"].append({"en": key, "ar": value, "count": 0})
80
+ if key in data_lists["filtered_data"]:
81
+ data_lists["filtered_data"][key]["count"] += 1
82
+ else:
83
+ data_lists["filtered_data"][key] = {"ar": value, "count": 0}
84
+ # ---
85
+ key2 = key.replace(key_digits.group(), YEAR_PATTERN)
86
+ value2 = value.replace(value_digits.group(), YEAR_PATTERN)
87
  # ---
88
+ # data_lists["cats_2000"].append({"en": key2, "ar": value2, "count": 0})
 
89
  # ---
90
+ if key2 in data_lists["cats_2000"]:
91
+ data_lists["cats_2000"][key2]["count"] += 1
92
+ else:
93
+ data_lists["cats_2000"][key2] = {"ar": value2, "count": 0}
94
  # ----
95
  # البحث عن اسم الدولة في key2
96
  match = re.search(regex_pattern, key2)
 
100
  ar_country = countries.get(en_country)
101
  # ---
102
  if ar_country and ar_country in value2:
103
+ key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
104
+ value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)
105
  # ---
106
+ if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
 
107
  # ---
108
+ if key3 in data_lists["cats_2000_contry"]:
109
+ data_lists["cats_2000_contry"][key3]["count"] += 1
110
+ else:
111
+ data_lists["cats_2000_contry"][key3] = {"ar": value3, "count": 0}
 
 
 
 
 
 
112
  # ----
113
+ # ----
114
+ print(f"{len(data_lists['cats_2000_contry'])=}")
115
+ print(f"{len(data_lists['cats_2000'])=}")
116
 
117
  print(f"all data len: {len(data):,}.")
118
 
 
127
  for x in to_work:
128
  data_list = data_lists.get(x)
129
  # ---
130
+ data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
131
 
132
  # حفظ القاموس المصحح في ملف JSON
133
  with open(f"{x}.json", "w", encoding="utf-8") as f:
 
136
  print("______________")
137
  print(f"file: {x} uploaded successfully!")
138
  print(f"len of {x} : {len(data_list)}.")
139
+ # ---
140
+ # continue
141
+ # ---
142
  upload_file(
143
  path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
144
  path_in_repo=f"{x}.json", # المسار داخل المستودع