import tqdm
import re
import requests
from huggingface_hub import login
from datasets import Dataset
from datasets import load_dataset
from google.colab import userdata
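# Log in to Hugging Face with the HF_TOKEN stored in Colab user secrets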
login(userdata.get('HF_TOKEN'))
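# Fetch the nationalities data (demonym forms plus English/Arabic country names) from GitHub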
nats_url = "https://raw.githubusercontent.com/MrIbrahem/Nationalities/refs/heads/main/nats.json"
response = requests.get(nats_url)
response.raise_for_status()  # fail early if the download did not succeed
nationalities = response.json()
data_list = []
# ---
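# Entries to exclude from the dataset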
skip = [
"barbadian_2",
"west india !",
"democratic republic of the congo",
]
# ---
for x in nationalities["data"]:
    # Skip manually excluded entries
    if x["nat"] in skip:
        continue
    # ---
    # Drop entries whose nationality is identical to the country name
    # (ignoring hyphens, the word "the", and case)
    if x["nat"].replace("-", " ").replace("the ", "").lower() == x["en"].replace("-", " ").replace("the ", "").lower():
        continue
    # ---
    data_list.append({
        "nat_en": x["nat"],      # nationality (demonym) in English
        # gendered demonym forms taken from the source fields
        "man": x["men"],
        "men": x["mens"],
        "women": x["women"],
        "womens": x["womens"],
        "country_en": x["en"],   # country name in English
        "country_ar": x["ar"],   # country name in Arabic
    })
# ---
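# Put multi-word nationalities first (sort by descending number of spaces)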
data_list = sorted(data_list, key=lambda x: -x["nat_en"].count(' '))
# ---
print("______________")
print(f"len of nationalities : {len(data_list)}.")
# ---
print("____________________________")
# ---
# Create the Dataset
dataset = Dataset.from_list(data_list)
# Upload the Dataset to Hugging Face
dataset.push_to_hub("Ibrahemqasim/nationalities")
# ---
print("dataset: Ibrahemqasim/nationalities push_to_hub successfully!")