File size: 5,337 Bytes
805087b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import tqdm
import re
import requests
from huggingface_hub import login
from datasets import Dataset
from datasets import load_dataset
from google.colab import userdata
def _alternation_pattern(names):
    """Return regex source that matches any of *names* as a whole word.

    The alternatives are escaped and tried longest-first. A regex
    alternation takes the first branch that matches at a position, so a
    name that is a prefix of a longer one (for example "Guinea" and
    "Guinea-Bissau") must come after the longer name or it would shadow it.
    Empty/None names are skipped and duplicates are collapsed.

    The matched name is available as group 1.
    """
    # dict.fromkeys de-duplicates while keeping the dataset order, so the
    # stable sort below is deterministic for names of equal length.
    unique_names = list(dict.fromkeys(name for name in names if name))
    unique_names.sort(key=len, reverse=True)
    return r'\b(' + '|'.join(map(re.escape, unique_names)) + r')\b'


# Authenticate with the Hugging Face Hub using the HF_TOKEN value read from
# google.colab's userdata.
login(userdata.get('HF_TOKEN'))
# Category pairs to be templated; each row is read through its "en" and "ar" fields.
data = load_dataset("Ibrahemqasim/categories_en2ar", split="train")
# nationalities.keys() "nat_en","man","men","women","womens","country_en","country_ar",
nationalities = load_dataset("Ibrahemqasim/nationalities", split="train")
nationalities_pattern = _alternation_pattern(row["nat_en"] for row in nationalities)
countries = load_dataset("Ibrahemqasim/countries", split="train")
countries_pattern = _alternation_pattern(row["en"] for row in countries)
# ---
# Lookup tables from the English name to the full dataset row.
countries_dict = {cc["en"]: cc for cc in countries}
nationalities_dict = {cc["nat_en"]: cc for cc in nationalities}
# ---
# Names of the accumulators that will be converted to datasets and uploaded.
# Uncomment an entry to publish it as well.
to_work = [
    "categories_with_nationalities",
    # "categories_with_years",
    # "categories_with_YEAR_COUNTRY_pattern",
    # "categories_with_YEAR_pattern",
]
# One independent, initially empty accumulator per output dataset.
data_lists = {
    bucket_name: {}
    for bucket_name in (
        "categories_with_nationalities",
        "categories_with_years",
        "categories_with_YEAR_COUNTRY_pattern",
        "categories_with_YEAR_pattern",
    )
}
# Placeholder tokens substituted into the category templates.
YEAR_PATTERN, NAT = "{YEAR}", "{NAT}"
AR_NAT_MEN, COUNTRY_PATTERN = "{NAT_MEN}", "{COUNTRY}"
def _record(bucket, en, ar):
    """Count one occurrence of *en* in *bucket*.

    The first occurrence stores {"ar": ar, "count": 1}; later occurrences
    only increment "count" and keep the Arabic text seen first.
    """
    entry = bucket.get(en)
    if entry is None:
        bucket[en] = {"ar": ar, "count": 1}
    else:
        entry["count"] += 1


# Loop-invariant regexes, compiled once instead of on every row.
# Year token: two digit runs joined by an en dash or hyphen, or four digits.
_year_re = re.compile(r"(\d+[–-]\d+|\d{4})")
_nat_re = re.compile(nationalities_pattern)
_country_re = re.compile(countries_pattern)

for tab in tqdm.tqdm(data):
    key = tab["en"]
    value = tab["ar"]
    # ---
    # 1) Nationality template: the English nationality and its Arabic "men"
    #    form must both be present so that both sides can be templated.
    match1 = _nat_re.search(key)
    if match1:
        en_country = match1.group(1)
        ar_country = nationalities_dict.get(en_country, {}).get("men", "")
        # ---
        if ar_country and ar_country in value:
            # NOTE(review): the English side is tagged with COUNTRY_PATTERN
            # although NAT is defined and never used — confirm which
            # placeholder downstream consumers expect before changing it.
            key1 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key)
            value1 = re.sub(rf'\b{re.escape(ar_country)}\b', AR_NAT_MEN, value)
            # ---
            # The word-boundary substitution can fail even though the
            # substring test passed, so verify that both sides were replaced.
            if COUNTRY_PATTERN in key1 and AR_NAT_MEN in value1:
                _record(data_lists["categories_with_nationalities"], key1, value1)
                # A row recorded as a nationality template is not examined
                # for year patterns.
                continue
    # ---
    # 2) Year template: only when both sides contain the same year token.
    key_digits = _year_re.search(key)
    value_digits = _year_re.search(value)
    if not (key_digits and value_digits and key_digits.group() == value_digits.group()):
        continue
    year = key_digits.group()
    # ---
    _record(data_lists["categories_with_years"], key, value)
    # ---
    key2 = key.replace(year, YEAR_PATTERN)
    value2 = value.replace(year, YEAR_PATTERN)
    _record(data_lists["categories_with_YEAR_pattern"], key2, value2)
    # ----
    # 3) Year + country template: search for a country name in key2.
    match = _country_re.search(key2)
    if not match:
        continue
    en_country = match.group(1)
    # Fix: `countries` is a datasets.Dataset and has no .get(); use the
    # countries_dict lookup table instead.
    # Assumes each countries row carries the Arabic name under "ar" — TODO confirm.
    ar_country = countries_dict.get(en_country, {}).get("ar", "")
    # ---
    if not ar_country or ar_country not in value2:
        continue
    key3 = re.sub(rf'\b{re.escape(en_country)}\b', COUNTRY_PATTERN, key2)
    value3 = re.sub(rf'\b{re.escape(ar_country)}\b', COUNTRY_PATTERN, value2)
    # ---
    if COUNTRY_PATTERN in key3 and COUNTRY_PATTERN in value3:
        _record(data_lists["categories_with_YEAR_COUNTRY_pattern"], key3, value3)
# ----
# Summary of what was collected.
print(f"{len(data_lists['categories_with_YEAR_COUNTRY_pattern'])=}")
print(f"{len(data_lists['categories_with_YEAR_pattern'])=}")
print(f"all data len: {len(data):,}.")
# Convert every enabled accumulator into a Dataset and publish it to the Hub.
for x in to_work:
    bucket = data_lists.get(x)
    # ---
    if x != "countries":
        # Flatten {en: {"ar", "count"}} into rows, most frequent first.
        data_list = sorted(
            (
                {"en": en_text, "ar": info["ar"], "count": info["count"]}
                for en_text, info in bucket.items()
            ),
            key=lambda row: row["count"],
            reverse=True,
        )
    else:
        # Plain en -> ar mapping without counters.
        data_list = [
            {"en": en_text, "ar": ar_text}
            for en_text, ar_text in bucket.items()
        ]
    # ---
    print("______________")
    print(f"len of {x} : {len(data_list)}.")
    # ---
    print("____________________________")
    # ---
    # Create the Dataset
    dataset = Dataset.from_list(data_list)
    # Upload the Dataset to Hugging Face
    dataset.push_to_hub(f"Ibrahemqasim/{x}")
    # ---
    print(f"dataset: Ibrahemqasim/{x} push_to_hub successfully!")
|