|
import tqdm |
|
import re |
|
import json |
|
import requests |
|
from huggingface_hub import login |
|
from huggingface_hub import upload_file |
|
|
|
|
|
# Authenticate with the Hugging Face Hub before any upload is attempted.
# NOTE(review): the literal below looks like a placeholder -- substitute a real
# access token before running.
login(token="YOUR_ACCESS_TOKEN")
|
|
|
|
|
# Location of the raw langlinks mapping that the rest of the script processes.
json_url = "https://huggingface.co/Ibrahemqasim/enwiki_to_arwiki_categories/resolve/main/langlinks.json"

# A timeout keeps the script from hanging forever on a stalled connection; it
# bounds the connect phase and each wait for data, not the total download time.
response = requests.get(json_url, timeout=60)

# Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
response.raise_for_status()

# The loop further down iterates this with .items(), so it is expected to be a
# JSON object (dict).
data = response.json()
|
|
|
|
|
|
|
# One empty, independent dict per output table; the key doubles as the name of
# the JSON file written for that table later in the script.
data_lists = {
    table_name: {}
    for table_name in ("langlinks", "filtered_data", "cats_2000")
}
|
|
|
# Matches the first run of four consecutive digits in a string (presumably a
# year in a category title). Compiled once instead of on every iteration.
_FOUR_DIGITS = re.compile(r"\d{4}")

# Fill the three tables in data_lists from the raw key -> value mapping.
for key, value in tqdm.tqdm(data.items()):
    # Drop a pair of literal double quotes wrapping the key, if present.
    if key.startswith('"') and key.endswith('"'):
        key = key[1:-1]

    # Drop the leading ':"' and trailing '",' wrapping the value, if present.
    if value.startswith(':"') and value.endswith('",'):
        value = value[2:-2]

    # Every cleaned pair goes into the full table.
    data_lists["langlinks"][key] = value

    key_digits = _FOUR_DIGITS.search(key)
    value_digits = _FOUR_DIGITS.search(value)

    # Only pairs whose first four-digit run is identical on both sides are
    # kept for the filtered tables.
    if key_digits and value_digits and key_digits.group() == value_digits.group():
        data_lists["filtered_data"][key] = value

        # Normalise the shared number to "2000" on both sides so that pairs
        # differing only in that number collapse into a single entry.
        key2 = key.replace(key_digits.group(), "2000")
        value2 = value.replace(value_digits.group(), "2000")

        # Store the normalised pair. The original stored the unmodified
        # key/value here, which made this table a copy of "filtered_data" and
        # left key2/value2 unused.
        data_lists["cats_2000"][key2] = value2
|
|
|
|
|
# Write each table to "<name>.json" as a list of {"en": ..., "ar": ...} records
# and upload that file to the Hub repository under the same file name.
for x, data_list in data_lists.items():
    # Rebind to the record-list form; this is what gets written and counted.
    data_list = [{"en": en, "ar": ar} for en, ar in data_list.items()]
    file_name = f"{x}.json"

    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    with open(file_name, "w", encoding="utf-8") as fh:
        json.dump(data_list, fh, ensure_ascii=False, indent=4)

    upload_file(
        path_or_fileobj=file_name,
        path_in_repo=file_name,
        repo_id="Ibrahemqasim/enwiki_to_arwiki_categories",
    )

    # Per-file progress report.
    print("__________________")
    print(f"file: {x} uploaded successfully!")
    print(f"{len(data)=}.")
    print(f"{len(data_list)} rows uploaded.")
|
|