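"""Streamlit UI components for a Hugging Face training-code generator.

Each ``show_*_component`` function renders one section of the UI, records the
user's choices in the shared ``inputs`` dict, and returns that dict so the
components can be called in sequence by the app's entry point.
"""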
import collections
import os
from typing import Dict

import streamlit as st
from datasets import get_dataset_config_names
from jinja2 import Environment, FileSystemLoader

import utils
from configuration import OPTIMIZERS_ACCELERATE, OPTIMIZERS_TRAINER, TASKS, TASKS_TO_PIPELINE_TAG
from utils import (get_dataset_infos_dict, get_datasets, get_model_to_model_id,
                   render_features)


def show_API_component(inputs: Dict[str, str]) -> Dict[str, str]:
    """Let the user pick which Hugging Face API (template directory) to generate code for."""
    template_dict = collections.defaultdict()
    template_dirs = [
        f for f in os.scandir("templates") if f.is_dir() and f.name != "example"
    ]
    template_dirs = sorted(template_dirs, key=lambda e: e.name)
    for template_dir in template_dirs:
        template_dict[template_dir.name] = template_dir.path

    st.write("## API")
    inputs['api'] = st.selectbox(
        "Which Hugging Face API do you want to use?", list(template_dict.keys())
    )
    inputs['template_dir'] = template_dict.get(inputs['api'])
    return inputs


def show_model_component(inputs: Dict[str, str]) -> Dict[str, str]:
    """Let the user pick a model checkpoint that matches the selected NLP task."""
    model_info = get_model_to_model_id()
    models = model_info['model_to_model_id']
    models_pipeline = model_info["model_to_pipeline_tag"]

    st.write("## Model")
    models_for_task = []
    for model in models:
        if models_pipeline[model] == inputs["nlp_task"]:
            models_for_task.append(model)
    model = st.selectbox("Which model?", list(models_for_task))
    inputs["model_checkpoint"] = models.get(model)
    inputs["pretrained"] = st.checkbox("Use pre-trained model")
    return inputs


def show_task_component(inputs: Dict[str, str]) -> Dict[str, str]:
    """Let the user pick the training task and the corresponding NLP task."""
    st.write("## Task")
    task = st.selectbox("Which task?", TASKS)
    inputs["task"] = task
    inputs["nlp_task"] = st.selectbox(
        "Which NLP task?", TASKS_TO_PIPELINE_TAG[task])
    return inputs


def show_input_data_component(inputs: Dict[str, str]) -> Dict[str, str]:
    """Let the user pick a dataset, subset, splits, and the feature(s) to train on."""
    st.write("## Input data")
    english_datasets = get_datasets()
    english_datasets_for_task = []
    for dataset in english_datasets:
        for task_category in english_datasets[dataset]:
            if task_category == inputs["nlp_task"]:
                english_datasets_for_task.append(dataset)
                # No need to check the remaining task categories for this dataset.
                break
    inputs["dataset"] = st.selectbox(
        "Which one?", tuple(english_datasets_for_task)
    )
    configs = get_dataset_config_names(inputs["dataset"])
    inputs["subset"] = st.selectbox("Which subset?", list(configs))
    data_info_dict = get_dataset_infos_dict(
        inputs["dataset"], inputs["subset"])

    assert data_info_dict.splits is not None
    splits = list(data_info_dict.splits.keys())
    # Default the training split to 'train' when it exists, otherwise the first split.
    if 'train' in splits:
        train_index = splits.index('train')
    else:
        train_index = 0
    inputs["train"] = st.selectbox(
        "Which split for training?", splits, index=train_index)
    # Default the validation split to 'validation' when it exists, otherwise the last split.
    if 'validation' in splits:
        validation_index = splits.index('validation')
    else:
        validation_index = len(splits) - 1
    inputs["validation"] = st.selectbox(
        "Which split for validation?", splits, index=validation_index)

    assert data_info_dict.features is not None
    # Pre-select the 'translation' feature for translation tasks.
    feature_index = 0
    if inputs["nlp_task"] == 'translation':
        if 'translation' in list(data_info_dict.features.keys()):
            feature_index = list(
                data_info_dict.features.keys()).index('translation')
    inputs["feature"] = st.selectbox(
        "Which data feature?", list(data_info_dict.features.keys()), index=feature_index)
    if inputs["feature"] == 'translation':
        inputs["source_language"] = st.selectbox(
            "Which language for source?", list(data_info_dict.features['translation'].languages))
        inputs["target_language"] = st.selectbox(
            "Which language for target?", list(data_info_dict.features['translation'].languages))
    return inputs


def show_preprocessing_component(inputs: Dict[str, str]) -> Dict[str, str]:
    """Collect preprocessing options such as block size and masking settings."""
    st.write("## Preprocessing")
    inputs["block_size"] = st.number_input(
        "The length of each block (i.e. context size)", 1, None, 128)
    if inputs["task"] == "MaskedLM":
        inputs["mlm_probability"] = st.number_input(
            "The probability with which to (randomly) mask tokens in the input", 0.0, 1.00, 0.15)
        inputs["whole_word_masking"] = st.checkbox(
            "Use whole word masking")
    return inputs


def show_training_comoponent(inputs: Dict[str, str]) -> Dict[str, str]:
    """Collect training hyperparameters (optimizer, schedule, batch size, epochs)."""
    st.write("## Training")
    # inputs['with_tracker'] = st.selectbox(
    #     "Loggers to monitor the training ", ["none", "all", "tensorboard", "wandb", "comet_ml"])
    inputs["seed"] = st.number_input(
        "Seed", 1, None, 4)
    if inputs['api'] == 'Accelerate':
        optimizer_dict_to_use = OPTIMIZERS_ACCELERATE
    else:
        optimizer_dict_to_use = OPTIMIZERS_TRAINER
    inputs["optimizer"] = st.selectbox(
        "Optimizer", list(optimizer_dict_to_use.keys()))
    default_lr = optimizer_dict_to_use[inputs["optimizer"]]
    inputs["lr"] = st.number_input(
        "Learning rate", 0.000, None, default_lr, format="%f"
    )
    inputs["use_weight_decay"] = st.checkbox("Use weight decay")
    if inputs["use_weight_decay"]:
        inputs["weight_decay"] = st.number_input(
            "Weight decay", 0.000, None, 0.01, format="%f"
        )
    inputs["gradient_accumulation_steps"] = st.number_input(
        "Gradient Accumulation Steps", 1, None, 8)
    inputs['lr_scheduler_type'] = st.selectbox(
        "The scheduler type to use",
        ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"])
    inputs['num_warmup_steps'] = st.number_input(
        "Num warmup steps", 0, None, 0)
    inputs["batch_size"] = st.number_input("Batch size", 1, None, 32)
    inputs["num_epochs"] = st.number_input("Epochs", 1, None, 3)
    return inputs


def show_datset_view_component(inputs: Dict[str, str]) -> Dict[str, str]:
    """Show a preview of the selected dataset: homepage, description, and features."""
    data_info_dict = get_dataset_infos_dict(
        inputs["dataset"], inputs["subset"])
    st.write(f'## Dataset view: {inputs["dataset"]}/{inputs["subset"]}')
    st.markdown(
        "*Homepage*: "
        + data_info_dict.homepage
        + "\n\n*Dataset*: https://github.com/huggingface/datasets/blob/master/datasets/%s/%s.py"
        % (inputs["dataset"], inputs["dataset"])
    )
    s = []
    s.append('dataset' + "=" + inputs["dataset"])
    s.append('config' + "=" + inputs["subset"])
    st.markdown(
        "*Permalink*: https://huggingface.co/datasets/viewer/?"
        + "&".join(s)
    )
    # https://github.com/huggingface/datasets-viewer/blob/master/run.py#L282
    st.write(f'{data_info_dict.description}')
    st.write(render_features(data_info_dict.features))
    # TODO: if the dataset is too big, switch to streaming mode.
    # TODO: cache this part of the code.
    # selected_dataset = load_dataset(
    #     inputs["dataset"], inputs["subset"], split=inputs["train"], streaming=True)
    # print(selected_dataset)
    # print(next(iter(selected_dataset)))
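    # A possible sketch for the streaming TODO above (untested; assumes
    # `load_dataset` is imported from `datasets` and that the selected dataset
    # exposes `dataset_size` in its DatasetInfo; the 1 GB threshold is arbitrary):
    #
    # MAX_PREVIEW_BYTES = 1_000_000_000
    # if data_info_dict.dataset_size and data_info_dict.dataset_size > MAX_PREVIEW_BYTES:
    #     preview = load_dataset(
    #         inputs["dataset"], inputs["subset"], split=inputs["train"], streaming=True)
    #     st.write(next(iter(preview)))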
    return inputs


def show_code_component(inputs: Dict[str, str]) -> Dict[str, str]:
    """Render the Jinja template for the chosen task and offer the generated code for download."""
    # Generate code and notebook from the task's .py.jinja template in the selected template dir.
    env = Environment(
        loader=FileSystemLoader(inputs['template_dir']), trim_blocks=True, lstrip_blocks=True,
    )
    template = env.get_template(f'task_templates/{inputs["nlp_task"]}.py.jinja')
    code = template.render(header=utils.code_header, notebook=False, **inputs)
    notebook_code = template.render(
        header=utils.notebook_header, notebook=True, **inputs)
    notebook = utils.to_notebook(notebook_code)

    st.write(f'## Code view: {inputs["api"]}')
    st.write("")  # add vertical space
    col1, col2 = st.beta_columns(2)
    with col1:
        utils.download_button(code, "generated-code.py", "🐍 Download (.py)")
    with col2:
        utils.download_button(
            notebook, "generated-notebook.ipynb", "📓 Download (.ipynb)")
    colab_error = st.empty()

    # Display code.
    st.code(code)
    return inputs
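

# Hypothetical usage (the actual wiring lives in the app's entry point, which is
# not part of this file): each component mutates and returns the shared `inputs`
# dict, so they are meant to be chained in an order that satisfies their
# dependencies, e.g.:
#
#     inputs = {}
#     inputs = show_API_component(inputs)
#     inputs = show_task_component(inputs)
#     inputs = show_model_component(inputs)
#     inputs = show_input_data_component(inputs)
#     inputs = show_preprocessing_component(inputs)
#     inputs = show_training_comoponent(inputs)
#     inputs = show_datset_view_component(inputs)
#     inputs = show_code_component(inputs)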