Ibrahemqasim committed on
Commit
55da6d3
·
verified ·
1 Parent(s): c4328ec

Update fix langlinks.py

Browse files
Files changed (1) hide show
  1. fix langlinks.py +36 -49
fix langlinks.py CHANGED
@@ -24,17 +24,15 @@ countries = response2.json()
24
  # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
25
 
26
  to_work = [
27
- "langlinks",
28
- "filtered_data",
29
- "cats_2000_contry",
30
- "cats_2000",
31
  ]
32
 
33
  data_lists = {
34
- "langlinks" : {},
35
- "filtered_data" : {},
36
- "cats_2000_contry" : {},
37
- "cats_2000" : {},
38
  }
39
 
40
 
@@ -63,12 +61,6 @@ for tab in tqdm.tqdm(data):
63
  if value.startswith(':"') and value.endswith('",'):
64
  value = value[2:-2]
65
  # ----
66
- # data_lists["langlinks"].append({"en": key, "ar": value, "count": 0})
67
- if key in data_lists["langlinks"]:
68
- data_lists["langlinks"][key]["count"] += 1
69
- else:
70
- data_lists["langlinks"][key] = {"ar": value, "count": 0}
71
- # ----
72
  # Add if key and value has 4 digits and they are the same
73
  reg_year = r"(\d+[–-]\d+|\d{4})"
74
  # ---
@@ -76,21 +68,21 @@ for tab in tqdm.tqdm(data):
76
  value_digits = re.search(reg_year, value)
77
  # ----
78
  if key_digits and value_digits and key_digits.group() == value_digits.group():
79
- # data_lists["filtered_data"].append({"en": key, "ar": value, "count": 0})
80
- if key in data_lists["filtered_data"]:
81
- data_lists["filtered_data"][key]["count"] += 1
82
  else:
83
- data_lists["filtered_data"][key] = {"ar": value, "count": 0}
84
  # ---
85
  key2 = key.replace(key_digits.group(), YEAR_PATTERN)
86
  value2 = value.replace(value_digits.group(), YEAR_PATTERN)
87
  # ---
88
- # data_lists["cats_2000"].append({"en": key2, "ar": value2, "count": 0})
89
  # ---
90
- if key2 in data_lists["cats_2000"]:
91
- data_lists["cats_2000"][key2]["count"] += 1
92
  else:
93
- data_lists["cats_2000"][key2] = {"ar": value2, "count": 0}
94
  # ----
95
  # البحث عن اسم الدولة في key2
96
  match = re.search(regex_pattern, key2)
@@ -105,54 +97,49 @@ for tab in tqdm.tqdm(data):
105
  # ---
106
  if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
107
  # ---
108
- if key3 in data_lists["cats_2000_contry"]:
109
- data_lists["cats_2000_contry"][key3]["count"] += 1
110
  else:
111
- data_lists["cats_2000_contry"][key3] = {"ar": value3, "count": 0}
112
  # ----
113
  # ----
114
- print(f"{len(data_lists['cats_2000_contry'])=}")
115
- print(f"{len(data_lists['cats_2000'])=}")
116
 
117
  print(f"all data len: {len(data):,}.")
118
 
119
- datasets_list = {
120
- "langlinks" : "categories_en2ar",
121
- "filtered_data" : "categories_en2ar_with_years",
122
- "cats_2000_contry" : "categories_en2ar-cats_2000_contry",
123
- "cats_2000" : "categories_en2ar-cats_2000",
124
- }
125
-
126
  # for x, data_list in data_lists.items():
127
  for x in to_work:
128
  data_list = data_lists.get(x)
129
  # ---
130
  data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
131
-
132
- # حفظ القاموس المصحح في ملف JSON
133
- with open(f"{x}.json", "w", encoding="utf-8") as f:
134
- json.dump(data_list, f, ensure_ascii=False, indent=4)
135
-
136
  print("______________")
137
- print(f"file: {x} uploaded successfully!")
138
  print(f"len of {x} : {len(data_list)}.")
139
  # ---
140
  # continue
141
  # ---
 
 
 
 
 
 
 
142
  upload_file(
143
  path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
144
  path_in_repo=f"{x}.json", # المسار داخل المستودع
145
  repo_id="Ibrahemqasim/enwiki_to_arwiki_categories", # معرف المستودع
146
  # repo_type="dataset", # نوع المستودع (نستخدم dataset للملفات)
147
  )
148
-
 
149
  print("____________________________")
 
 
 
150
 
151
- set_name = datasets_list.get(x)
152
-
153
- if set_name:
154
- # إنشاء Dataset
155
- dataset = Dataset.from_list(data_list)
156
-
157
- # رفع Dataset إلى Hugging Face
158
- dataset.push_to_hub(f"Ibrahemqasim/{set_name}")
 
24
  # تحويل القاموس إلى قائمة من القواميس [{ "en": "value", "ar": "value" }, ...]
25
 
26
  to_work = [
27
+ "categories_with_years",
28
+ "categories_with_YEAR_COUNTRY_pattern",
29
+ "categories_with_YEAR_pattern",
 
30
  ]
31
 
32
  data_lists = {
33
+ "categories_with_years" : {},
34
+ "categories_with_YEAR_COUNTRY_pattern" : {},
35
+ "categories_with_YEAR_pattern" : {},
 
36
  }
37
 
38
 
 
61
  if value.startswith(':"') and value.endswith('",'):
62
  value = value[2:-2]
63
  # ----
 
 
 
 
 
 
64
  # Add if key and value has 4 digits and they are the same
65
  reg_year = r"(\d+[–-]\d+|\d{4})"
66
  # ---
 
68
  value_digits = re.search(reg_year, value)
69
  # ----
70
  if key_digits and value_digits and key_digits.group() == value_digits.group():
71
+ # data_lists["categories_with_years"].append({"en": key, "ar": value, "count": 1})
72
+ if key in data_lists["categories_with_years"]:
73
+ data_lists["categories_with_years"][key]["count"] += 1
74
  else:
75
+ data_lists["categories_with_years"][key] = {"ar": value, "count": 1}
76
  # ---
77
  key2 = key.replace(key_digits.group(), YEAR_PATTERN)
78
  value2 = value.replace(value_digits.group(), YEAR_PATTERN)
79
  # ---
80
+ # data_lists["categories_with_YEAR_pattern"].append({"en": key2, "ar": value2, "count": 1})
81
  # ---
82
+ if key2 in data_lists["categories_with_YEAR_pattern"]:
83
+ data_lists["categories_with_YEAR_pattern"][key2]["count"] += 1
84
  else:
85
+ data_lists["categories_with_YEAR_pattern"][key2] = {"ar": value2, "count": 1}
86
  # ----
87
  # البحث عن اسم الدولة في key2
88
  match = re.search(regex_pattern, key2)
 
97
  # ---
98
  if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
99
  # ---
100
+ if key3 in data_lists["categories_with_YEAR_COUNTRY_pattern"]:
101
+ data_lists["categories_with_YEAR_COUNTRY_pattern"][key3]["count"] += 1
102
  else:
103
+ data_lists["categories_with_YEAR_COUNTRY_pattern"][key3] = {"ar": value3, "count": 1}
104
  # ----
105
  # ----
106
+ print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
107
+ print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")
108
 
109
  print(f"all data len: {len(data):,}.")
110
 
 
 
 
 
 
 
 
111
  # for x, data_list in data_lists.items():
112
  for x in to_work:
113
  data_list = data_lists.get(x)
114
  # ---
115
  data_list = [{"en": key, "ar": value["ar"], "count": value["count"]} for key, value in data_list.items()]
116
+ # ---
 
 
 
 
117
  print("______________")
 
118
  print(f"len of {x} : {len(data_list)}.")
119
  # ---
120
  # continue
121
  # ---
122
+ '''
123
+ # حفظ القاموس المصحح في ملف JSON
124
+ with open(f"{x}.json", "w", encoding="utf-8") as f:
125
+ json.dump(data_list, f, ensure_ascii=False, indent=4)
126
+ # ---
127
+ print(f"file: {x} uploaded successfully!")
128
+ # ---
129
  upload_file(
130
  path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
131
  path_in_repo=f"{x}.json", # المسار داخل المستودع
132
  repo_id="Ibrahemqasim/enwiki_to_arwiki_categories", # معرف المستودع
133
  # repo_type="dataset", # نوع المستودع (نستخدم dataset للملفات)
134
  )
135
+ '''
136
+ # ---
137
  print("____________________________")
138
+ # ---
139
+ # إنشاء Dataset
140
+ dataset = Dataset.from_list(data_list)
141
 
142
+ # رفع Dataset إلى Hugging Face
143
+ dataset.push_to_hub(f"Ibrahemqasim/{x}")
144
+ # ---
145
+ print(f"dataset: Ibrahemqasim/{x} push_to_hub successfully!")