Spaces:
Runtime error
Runtime error
import base64 | |
import importlib.util | |
import math | |
import re | |
import uuid | |
from types import ModuleType | |
from typing import Dict | |
import datasets | |
import jupytext | |
import requests | |
import streamlit as st | |
from datasets import DatasetInfo, get_dataset_infos | |
from datasets.info import DatasetInfosDict | |
from configuration import INCLUDED_USERS, TASKS_TO_PIPELINE_TAG | |
def import_from_file(module_name: str, filepath: str) -> ModuleType: | |
""" | |
Imports a module from file. | |
Args: | |
module_name (str): Assigned to the module's __name__ parameter (does not | |
influence how the module is named outside of this function) | |
filepath (str): Path to the .py file | |
Returns: | |
The module | |
""" | |
spec = importlib.util.spec_from_file_location(module_name, filepath) | |
module = importlib.util.module_from_spec(spec) | |
spec.loader.exec_module(module) | |
return module | |
def notebook_header(text: str): | |
""" | |
Insert section header into a jinja file, formatted as notebook cell. | |
Leave 2 blank lines before the header. | |
""" | |
return f"""# # {text} | |
""" | |
def code_header(text: str): | |
""" | |
Insert section header into a jinja file, formatted as Python comment. | |
Leave 2 blank lines before the header. | |
""" | |
seperator_len = (75 - len(text)) / 2 | |
seperator_len_left = math.floor(seperator_len) | |
seperator_len_right = math.ceil(seperator_len) | |
return f"# {'-' * seperator_len_left} {text} {'-' * seperator_len_right}" | |
def to_notebook(code: str) -> str: | |
"""Converts Python code to Jupyter notebook format.""" | |
notebook = jupytext.reads(code, fmt="py") | |
# print(jupytext.writes(notebook, fmt="ipynb")) | |
return jupytext.writes(notebook, fmt="ipynb") | |
def download_button( | |
object_to_download: str, download_filename: str, button_text: str # , pickle_it=False | |
): | |
""" | |
Generates a link to download the given object_to_download. | |
From: https://discuss.streamlit.io/t/a-download-button-with-custom-css/4220 | |
Params: | |
------ | |
object_to_download: The object to be downloaded. | |
download_filename (str): filename and extension of file. e.g. mydata.csv, | |
some_txt_output.txt download_link_text (str): Text to display for download | |
link. | |
button_text (str): Text to display on download button (e.g. 'click here to download file') | |
pickle_it (bool): If True, pickle file. | |
Returns: | |
------- | |
(str): the anchor tag to download object_to_download | |
Examples: | |
-------- | |
download_link(your_df, 'YOUR_DF.csv', 'Click to download data!') | |
download_link(your_str, 'YOUR_STRING.txt', 'Click to download text!') | |
""" | |
# try: | |
# # some strings <-> bytes conversions necessary here | |
b64 = base64.b64encode(object_to_download.encode()).decode() | |
# except AttributeError: | |
# b64 = base64.b64encode(object_to_download).decode() | |
button_uuid = str(uuid.uuid4()).replace("-", "") | |
button_id = re.sub("\d+", "", button_uuid) | |
custom_css = f""" | |
<style> | |
#{button_id} {{ | |
display: inline-flex; | |
align-items: center; | |
justify-content: center; | |
background-color: rgb(255, 255, 255); | |
color: rgb(38, 39, 48); | |
padding: .25rem .75rem; | |
position: relative; | |
text-decoration: none; | |
border-radius: 4px; | |
border-width: 1px; | |
border-style: solid; | |
border-color: rgb(230, 234, 241); | |
border-image: initial; | |
}} | |
#{button_id}:hover {{ | |
border-color: rgb(246, 51, 102); | |
color: rgb(246, 51, 102); | |
}} | |
#{button_id}:active {{ | |
box-shadow: none; | |
background-color: rgb(246, 51, 102); | |
color: white; | |
}} | |
</style> """ | |
dl_link = ( | |
custom_css | |
+ f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br><br>' | |
) | |
st.markdown(dl_link, unsafe_allow_html=True) | |
def get_model_to_model_id() -> Dict[str, Dict[str, str]]: | |
requests.get("https://huggingface.co") | |
response = requests.get("https://huggingface.co/api/models") | |
tags = response.json() | |
model_to_model_id = {} | |
model_to_pipeline_tag = {} | |
for model in tags: | |
model_name = model['modelId'] | |
is_community_model = "/" in model_name | |
if is_community_model: | |
user = model_name.split("/")[0] | |
if user not in INCLUDED_USERS: | |
continue | |
# TODO Right now if pipiline is not defined, skip | |
if "pipeline_tag" in model: | |
model_to_model_id[model['id']] = model['modelId'] | |
model_to_pipeline_tag[model['id']] = model["pipeline_tag"] | |
return {"model_to_model_id": model_to_model_id, "model_to_pipeline_tag": model_to_pipeline_tag} | |
def get_datasets() -> Dict[str, str]: | |
english_datasets = {} | |
response = requests.get( | |
"https://huggingface.co/api/datasets?full=true&languages=en") | |
tags = response.json() | |
for dataset in tags: | |
dataset_name = dataset["id"] | |
is_community_dataset = "/" in dataset_name | |
if is_community_dataset: | |
# user = dataset_name.split("/")[0] | |
# if user in INCLUDED_USERS: | |
# english_datasets.append(dataset_name) | |
continue | |
if "cardData" not in dataset: | |
continue | |
metadata = dataset["cardData"] | |
if "languages" not in metadata: | |
continue | |
if "task_categories" not in metadata: | |
continue | |
task_is_valid = False | |
for task_category in metadata["task_categories"]: | |
if any(task_category in task for task in list(TASKS_TO_PIPELINE_TAG.values())): | |
task_is_valid = True | |
if not task_is_valid: | |
continue | |
languages = metadata["languages"] | |
if "en" in languages or "en-US" in languages: | |
english_datasets[dataset_name] = metadata["task_categories"] | |
return english_datasets | |
def get_dataset_infos_dict(dataset: str, subset: str) -> DatasetInfo: | |
return DatasetInfosDict(get_dataset_infos(dataset))[subset] | |
# https://github.com/huggingface/datasets-viewer/blob/master/run.py#L49 | |
def render_features(features): | |
# TODO redner translation object with the languages tags | |
if isinstance(features, dict): | |
return {k: render_features(v) for k, v in features.items()} | |
if isinstance(features, datasets.features.ClassLabel): | |
return features.names | |
if isinstance(features, datasets.features.Value): | |
return features.dtype | |
if isinstance(features, datasets.features.Sequence): | |
return {"[]": render_features(features.feature)} | |
return features | |