|
import tqdm |
|
import re |
|
import requests |
|
from huggingface_hub import login |
|
from datasets import Dataset |
|
from datasets import load_dataset |
|
from google.colab import userdata |
|
|
|
# Authenticate against the Hugging Face Hub with the token stored under the
# "HF_TOKEN" key of the Colab user-data store.
login(token=userdata.get("HF_TOKEN"))
|
|
|
# Location of the raw nationalities JSON document that this script downloads.
nats_url = "https://raw.githubusercontent.com/MrIbrahem/Nationalities/refs/heads/main/nats.json"

# Without a timeout, requests waits indefinitely on a stalled connection.
# raise_for_status() turns an HTTP error response (4xx/5xx) into an explicit
# exception here instead of an unrelated JSON decoding failure below.
response = requests.get(nats_url, timeout=30)
response.raise_for_status()

# Decoded JSON payload; the code further down reads its "data" entry.
nationalities = response.json()
|
|
|
def _normalized(name):
    """Return *name* with hyphens as spaces, every "the " removed, lowercased."""
    # NOTE(review): "the " is stripped before lowercasing, so a capitalised
    # "The " would survive the comparison — confirm that is intended.
    return name.replace("-", " ").replace("the ", "").lower()


# Entries whose "nat" value appears here are never exported.
skip = [
    "barbadian_2",
    "west india !",
    "democratic republic of the congo",
]

# One output row per remaining entry of nationalities["data"]. An entry is
# dropped when its "nat" is in the skip list, or when its "nat" and "en"
# values are equal after normalisation.
data_list = [
    {
        "nat_en": entry["nat"],
        "man": entry["men"],
        "men": entry["mens"],
        "women": entry["women"],
        "womens": entry["womens"],
        "country_en": entry["en"],
        "country_ar": entry["ar"],
    }
    for entry in nationalities["data"]
    if entry["nat"] not in skip
    and _normalized(entry["nat"]) != _normalized(entry["en"])
]

# Rows whose "nat_en" contains the most spaces come first. The sort is stable
# (also with reverse=True), so rows with equal counts keep their source order.
data_list.sort(key=lambda row: row["nat_en"].count(" "), reverse=True)
|
|
|
# Report how many rows were collected, framed by two separator lines.
row_count = len(data_list)
print("______________")
print(f"len of nationalities : {row_count}.")
print("____________________________")

# Build a Dataset from the collected rows and upload it to the Hub repository.
repo_id = "Ibrahemqasim/nationalities"
dataset = Dataset.from_list(data_list)
dataset.push_to_hub(repo_id)

print(f"dataset: {repo_id} push_to_hub successfully!")
|
|