Ibrahemqasim committed on
Commit
675327e
·
verified ·
1 Parent(s): 440236c

Update fix langlinks.py

Browse files
Files changed (1) hide show
  1. fix langlinks.py +34 -5
fix langlinks.py CHANGED
@@ -6,8 +6,10 @@ from huggingface_hub import login
6
  from huggingface_hub import upload_file
7
  from datasets import Dataset
8
 
 
 
9
  # تسجيل الدخول إلى Hugging Face (استبدل "YOUR_ACCESS_TOKEN" بالتوكن الخاص بك)
10
- login("YOUR_ACCESS_TOKEN")
11
 
12
  # تحميل الملف JSON من الرابط مباشرة
13
  json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
@@ -24,8 +26,8 @@ countries = response2.json()
24
  to_work = [
25
  # "langlinks",
26
  # "filtered_data",
27
- # "cats_2000_contry",
28
- "cats_2000",
29
  ]
30
 
31
  data_lists = {
@@ -35,6 +37,14 @@ data_lists = {
35
  "cats_2000" : {},
36
  }
37
 
 
 
 
 
 
 
 
 
38
  for tab in tqdm.tqdm(data):
39
  # ---
40
  key = tab["en"]
@@ -68,12 +78,30 @@ for tab in tqdm.tqdm(data):
68
  # data_lists["cats_2000"].append({"en": key2, "ar": value2})
69
  data_lists["cats_2000"][key2] = value2
70
  # ----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  for en_c, ar_c in countries.items():
72
  if en_c in key2 and ar_c in value2:
73
  key3 = key2.replace(en_c, "country")
74
  value3 = value2.replace(ar_c, "country")
75
  # ---
76
  data_lists["cats_2000_contry"][key3] = value3
 
 
77
 
78
 
79
  print(f"all data len: {len(data):,}.")
@@ -85,6 +113,7 @@ datasets_list = {
85
  "cats_2000" : "categories_en2ar-cats_2000",
86
  }
87
 
 
88
  # for x, data_list in data_lists.items():
89
  for x in to_work:
90
  data_list = data_lists.get(x)
@@ -98,7 +127,7 @@ for x in to_work:
98
  print("______________")
99
  print(f"file: {x} uploaded successfully!")
100
  print(f"len of {x} : {len(data_list)}.")
101
-
102
  upload_file(
103
  path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
104
  path_in_repo=f"{x}.json", # المسار داخل المستودع
@@ -115,4 +144,4 @@ for x in to_work:
115
  dataset = Dataset.from_list(data_list)
116
 
117
  # رفع Dataset إلى Hugging Face
118
- dataset.push_to_hub(f"Ibrahemqasim/{set_name}")
 
6
  from huggingface_hub import upload_file
7
  from datasets import Dataset
8
 
9
+ from google.colab import userdata
10
+
11
  # تسجيل الدخول إلى Hugging Face (استبدل "YOUR_ACCESS_TOKEN" بالتوكن الخاص بك)
12
+ login(userdata.get('HF_API'))
13
 
14
  # تحميل الملف JSON من الرابط مباشرة
15
  json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
 
26
  to_work = [
27
  # "langlinks",
28
  # "filtered_data",
29
+ "cats_2000_contry",
30
+ # "cats_2000",
31
  ]
32
 
33
  data_lists = {
 
37
  "cats_2000" : {},
38
  }
39
 
40
+
41
+ # نرتب الدول حسب عدد الفراغات (تنازليًا)
42
+ sorted_keys = sorted(countries.keys(), key=lambda x: -x.count(' '))
43
+
44
+ # نبني تعبير regex
45
+ regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'
46
+
47
+
48
  for tab in tqdm.tqdm(data):
49
  # ---
50
  key = tab["en"]
 
78
  # data_lists["cats_2000"].append({"en": key2, "ar": value2})
79
  data_lists["cats_2000"][key2] = value2
80
  # ----
81
+ # البحث عن اسم الدولة في key2
82
+ match = re.search(regex_pattern, key2)
83
+ # ----
84
+ if match:
85
+ en_country = match.group(1)
86
+ ar_country = countries.get(en_country)
87
+ # ---
88
+ if ar_country and ar_country in value2:
89
+ key3 = re.sub(rf'\b{re.escape(en_country)}\b', 'country', key2)
90
+ value3 = re.sub(rf'\b{re.escape(ar_country)}\b', 'country', value2)
91
+ # ---
92
+ print(f"{key3} → {value3}")
93
+ # ---
94
+ data_lists["cats_2000_contry"][key3] = value3
95
+ # ----
96
+ """
97
  for en_c, ar_c in countries.items():
98
  if en_c in key2 and ar_c in value2:
99
  key3 = key2.replace(en_c, "country")
100
  value3 = value2.replace(ar_c, "country")
101
  # ---
102
  data_lists["cats_2000_contry"][key3] = value3
103
+ """
104
+ # ----
105
 
106
 
107
  print(f"all data len: {len(data):,}.")
 
113
  "cats_2000" : "categories_en2ar-cats_2000",
114
  }
115
 
116
+
117
  # for x, data_list in data_lists.items():
118
  for x in to_work:
119
  data_list = data_lists.get(x)
 
127
  print("______________")
128
  print(f"file: {x} uploaded successfully!")
129
  print(f"len of {x} : {len(data_list)}.")
130
+ continue
131
  upload_file(
132
  path_or_fileobj=f"{x}.json", # اسم الملف الذي تم حفظه
133
  path_in_repo=f"{x}.json", # المسار داخل المستودع
 
144
  dataset = Dataset.from_list(data_list)
145
 
146
  # رفع Dataset إلى Hugging Face
147
+ dataset.push_to_hub(f"Ibrahemqasim/{set_name}")