File size: 5,730 Bytes
8dc5272 7006d9b 8dc5272 d6d2307 8dc5272 675327e 8dc5272 6e8a74c 8dc5272 f012e57 8dc5272 7006d9b 4cf98f3 a3e5844 55da6d3 4cf98f3 7006d9b a3e5844 55da6d3 7006d9b 675327e a3e5844 675327e b5fe413 675327e f012e57 8dc5272 7006d9b b5fe413 7006d9b 55da6d3 b5fe413 55da6d3 b5fe413 7006d9b 55da6d3 7006d9b 55da6d3 b5fe413 55da6d3 f012e57 675327e b5fe413 675327e b5fe413 46e3596 55da6d3 b5fe413 55da6d3 675327e b5fe413 55da6d3 7006d9b a814d50 4cf98f3 a3e5844 dadb4f6 55da6d3 a814d50 b5fe413 55da6d3 7006d9b 55da6d3 4cf98f3 55da6d3 d6d2307 55da6d3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import tqdm
import re
import json
import requests
from huggingface_hub import login
from huggingface_hub import upload_file
from datasets import Dataset
from google.colab import userdata
# Log in to Hugging Face with the token that google.colab's userdata
# returns for the key 'HF_TOKEN'.
login(userdata.get('HF_TOKEN'))

# Download langlinks.json straight from the Hub. The rest of the script
# iterates it as records that carry an "en" and an "ar" field.
json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"
# The timeout keeps a stalled connection from hanging the run forever, and
# raise_for_status() turns an HTTP error response into a clear exception
# instead of a confusing JSON decode failure further down.
response = requests.get(json_url, timeout=60)
response.raise_for_status()
data = response.json()

# Download countries.json the same way. The rest of the script uses it as a
# dict that maps an English country name to its Arabic name.
json_url2 = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/countries.json"
response2 = requests.get(json_url2, timeout=60)
response2.raise_for_status()
countries = response2.json()
# Names of the tables this script builds and publishes, in publishing order.
to_work = [
    "countries",
    "categories_with_years",
    "categories_with_YEAR_COUNTRY_pattern",
    "categories_with_YEAR_pattern",
]
# One accumulator per table. Apart from "countries", each maps an English
# title (or template) to {"ar": <Arabic text>, "count": <occurrences>}.
data_lists = {
    "countries": {},
    "categories_with_years": {},
    "categories_with_YEAR_COUNTRY_pattern": {},
    "categories_with_YEAR_pattern": {},
}

# Order the English country names for the regex alternation below.
# Alternation is first-match, not longest-match: branches are tried left to
# right at each position. Names with more spaces therefore go first, and
# among names with the same number of spaces the longer one goes first, so a
# short name can never shadow a longer name that starts with it (for example
# "Guinea" vs "Guinea-Bissau", where \b also holds in front of the hyphen).
sorted_keys = sorted(countries, key=lambda name: (-name.count(' '), -len(name)))

# The published countries table is ordered by the number of spaces in the
# Arabic name, descending.
data_lists["countries"] = dict(sorted(countries.items(), key=lambda x: -x[1].count(' ')))

# Build the country regex: a single capturing group holding every escaped
# country name, delimited by word boundaries.
regex_pattern = r'\b(' + '|'.join(map(re.escape, sorted_keys)) + r')\b'

# Placeholders substituted into titles to turn them into templates.
YEAR_PATTERN = "{YEAR}"
COUNTRY_PATTERN = "{COUNTRY}"
# Year token: either two digit runs joined by an en dash or a hyphen, or a
# run of exactly four digits. Defined and compiled once, outside the loop.
reg_year = r"(\d+[–-]\d+|\d{4})"
year_re = re.compile(reg_year)
# Compile the large country alternation once instead of handing the raw
# pattern string to re.search() for every record.
country_re = re.compile(regex_pattern)

for tab in tqdm.tqdm(data):
    key = tab["en"]
    value = tab["ar"]

    # Some English titles arrive wrapped in double quotes, e.g.
    # "Category:1. FC Köln non-playing staff" -- drop the wrapping quotes.
    if key.startswith('"') and key.endswith('"'):
        key = key[1:-1]
    # Some Arabic titles arrive as  :"cc",  -- drop the leading (:") and the
    # trailing (",).
    if value.startswith(':"') and value.endswith('",'):
        value = value[2:-2]

    # Keep only pairs whose first year token is identical on both sides.
    key_digits = year_re.search(key)
    value_digits = year_re.search(value)
    if not (key_digits and value_digits and key_digits.group() == value_digits.group()):
        continue

    # Table 1: the raw pair. The first Arabic text seen for a title is kept;
    # later repeats only bump the counter.
    year_entry = data_lists["categories_with_years"].setdefault(key, {"ar": value, "count": 0})
    year_entry["count"] += 1

    # Table 2: the same pair with every occurrence of the year token replaced
    # by the {YEAR} placeholder.
    key2 = key.replace(key_digits.group(), YEAR_PATTERN)
    value2 = value.replace(value_digits.group(), YEAR_PATTERN)
    pattern_entry = data_lists["categories_with_YEAR_pattern"].setdefault(key2, {"ar": value2, "count": 0})
    pattern_entry["count"] += 1

    # Look for a known country name in the English template.
    match = country_re.search(key2)
    if not match:
        continue
    en_country = match.group(1)
    ar_country = countries.get(en_country)
    # The Arabic side must mention the matching Arabic country name.
    if not ar_country or ar_country not in value2:
        continue

    key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
    value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)

    # Table 3: record the template only when the placeholder landed on both
    # sides. The word-boundary substitution can leave the Arabic text
    # unchanged even though the plain substring test above succeeded.
    if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
        country_entry = data_lists["categories_with_YEAR_COUNTRY_pattern"].setdefault(
            key3, {"ar": value3, "count": 0}
        )
        country_entry["count"] += 1
# Summary of what the extraction loop collected.
print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")
print(f"all data len: {len(data):,}.")

# Publish every table named in to_work as its own Hugging Face dataset.
for x in to_work:
    data_list = data_lists.get(x)

    if x != "countries":
        # Counted tables: flatten {en: {"ar", "count"}} into rows, then put
        # the most frequent entries first.
        data_list = [
            {"en": en_text, "ar": entry["ar"], "count": entry["count"]}
            for en_text, entry in data_list.items()
        ]
        data_list.sort(key=lambda row: row["count"], reverse=True)
    else:
        # The countries table has no counter: plain {"en", "ar"} rows in the
        # order the dict already has.
        data_list = [
            {"en": en_text, "ar": ar_text}
            for en_text, ar_text in data_list.items()
        ]

    print("______________")
    print(f"len of {x} : {len(data_list)}.")

    # Disabled alternative, kept for reference: save the table to a local
    # JSON file and upload that file to the repository instead of pushing a
    # Dataset.
    #
    # with open(f"{x}.json", "w", encoding="utf-8") as f:
    #     json.dump(data_list, f, ensure_ascii=False, indent=4)
    # print(f"file: {x} uploaded successfully!")
    # upload_file(
    #     path_or_fileobj=f"{x}.json",  # name of the file that was saved
    #     path_in_repo=f"{x}.json",  # path inside the repository
    #     repo_id="Ibrahemqasim/enwiki_to_arwiki_categories",  # repository id
    #     # repo_type="dataset",  # repository type
    # )

    print("____________________________")

    # Build the Dataset from the rows and push it to the Hugging Face Hub.
    dataset = Dataset.from_list(data_list)
    dataset.push_to_hub(f"Ibrahemqasim/{x}")

    print(f"dataset: Ibrahemqasim/{x} push_to_hub successfully!")