# enwiki_to_arwiki_categories / make_nationalities.py
# Page metadata from the Hugging Face file view: uploaded by Ibrahemqasim;
# last commit "Update make_nationalities.py" (39a1891, verified); size 1.38 kB.
import tqdm
import re
import requests
from huggingface_hub import login
from datasets import Dataset
from datasets import load_dataset
from google.colab import userdata
# Authenticate with the Hugging Face Hub using the "HF_TOKEN" value read
# through google.colab's userdata helper.
login(userdata.get('HF_TOKEN'))
# Location of the JSON file holding the nationality entries; the rest of the
# script reads its top-level "data" list.
nats_url = "https://raw.githubusercontent.com/MrIbrahem/Nationalities/refs/heads/main/nats.json"
# requests applies no timeout by default, so a stalled connection would hang
# the script forever; bound the wait explicitly.
response = requests.get(nats_url, timeout=30)
# Stop on an HTTP error status (4xx/5xx) instead of trying to parse an error
# page as JSON further down.
response.raise_for_status()
nationalities = response.json()
# Entries whose "nat" value appears here are left out of the dataset.
skip = [
    "barbadian_2",
    "west india !",
    "democratic republic of the congo",
]


def _comparable(label):
    """Return *label* in the form used to compare a nationality with a country.

    Hyphens become spaces, every occurrence of "the " is removed, and the
    result is lower-cased. The replacements run before lower-casing, so only
    a lowercase "the " is stripped.
    """
    return label.replace("-", " ").replace("the ", "").lower()


# One row per kept entry. An entry is dropped when its "nat" value is in
# `skip`, or when "nat" and "en" are equal after normalisation.
# NOTE(review): the output columns "man"/"men" are filled from the source keys
# "men"/"mens" -- presumably singular/plural forms; confirm against nats.json.
data_list = [
    {
        "nat_en": entry["nat"],
        "man": entry["men"],
        "men": entry["mens"],
        "women": entry["women"],
        "womens": entry["womens"],
        "country_en": entry["en"],
        "country_ar": entry["ar"],
    }
    for entry in nationalities["data"]
    if entry["nat"] not in skip
    and _comparable(entry["nat"]) != _comparable(entry["en"])
]
# ---
# Put the nationality names containing the most spaces first. The sort is
# stable (also with reverse=True), so rows with the same number of spaces
# keep their existing relative order.
data_list = sorted(
    data_list,
    key=lambda row: row["nat_en"].count(" "),
    reverse=True,
)
# ---
# Report how many rows remain after filtering.
print("______________")
print(f"len of nationalities : {len(data_list)}.")
# ---
print("____________________________")
# ---
# Hub repository that receives the rows.
repo_id = "Ibrahemqasim/nationalities"
# Create the Dataset from the list of row dictionaries.
dataset = Dataset.from_list(data_list)
# Upload the Dataset to Hugging Face.
dataset.push_to_hub(repo_id)
# ---
# Only reached when push_to_hub returned without raising.
print("dataset: Ibrahemqasim/nationalities push_to_hub successfully!")