Commit 
							
							·
						
						318e969
	
1
								Parent(s):
							
							40e000b
								
docs: added FAQ section
Browse files- app.py +4 -3
- assets/image.png +0 -0
- src/distilabel_dataset_generator/faq.py +50 -0
- src/distilabel_dataset_generator/sft.py +6 -6
    	
        app.py
    CHANGED
    
    | @@ -1,10 +1,11 @@ | |
| 1 | 
             
            import gradio as gr
         | 
| 2 |  | 
| 3 | 
            -
            from src.distilabel_dataset_generator. | 
|  | |
| 4 |  | 
| 5 | 
             
            demo = gr.TabbedInterface(
         | 
| 6 | 
            -
                [ | 
| 7 | 
            -
                ["Supervised Fine-Tuning"],
         | 
| 8 | 
             
                title="⚗️ Distilabel Dataset Generator",
         | 
| 9 | 
             
                head="⚗️ Distilabel Dataset Generator",
         | 
| 10 | 
             
                theme="ParityError/Interstellar",
         | 
|  | |
| 1 | 
             
            import gradio as gr
         | 
| 2 |  | 
| 3 | 
            +
            from src.distilabel_dataset_generator.faq import app as faq_app
         | 
| 4 | 
            +
            from src.distilabel_dataset_generator.sft import app as sft_app
         | 
| 5 |  | 
| 6 | 
             
            demo = gr.TabbedInterface(
         | 
| 7 | 
            +
                [sft_app, faq_app],
         | 
| 8 | 
            +
                ["Supervised Fine-Tuning", "FAQ"],
         | 
| 9 | 
             
                title="⚗️ Distilabel Dataset Generator",
         | 
| 10 | 
             
                head="⚗️ Distilabel Dataset Generator",
         | 
| 11 | 
             
                theme="ParityError/Interstellar",
         | 
    	
        assets/image.png
    ADDED
    
    |   | 
    	
        src/distilabel_dataset_generator/faq.py
    ADDED
    
    | @@ -0,0 +1,50 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import gradio as gr
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            with gr.Blocks() as app:
         | 
| 4 | 
            +
                gr.Markdown(
         | 
| 5 | 
            +
                    """### FAQ
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                    #### What is Distilabel Dataset Generator?
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                    Distilabel Dataset Generator is a tool that allows you to easily create high-quality datasets for training and fine-tuning language models. It leverages the power of Distilabel and advanced language models to generate synthetic data tailored to your specific needs.
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                    
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                    This tool simplifies the process of creating custom datasets, enabling you to:
         | 
| 14 | 
            +
                    - Define the characteristics of your desired dataset
         | 
| 15 | 
            +
                    - Generate system prompts automatically
         | 
| 16 | 
            +
                    - Create sample datasets for quick iteration
         | 
| 17 | 
            +
                    - Produce full-scale datasets with customizable parameters
         | 
| 18 | 
            +
                    - Push your generated datasets directly to the Hugging Face Hub
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    By using Distilabel Dataset Generator, you can rapidly prototype and create datasets for various NLP tasks, accelerating your AI development process.
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                    #### How is this free?
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    The current implementation is based on [Free Serverless Hugging Face Inference Endpoints](https://huggingface.co/docs/api-inference/index). They are rate limited but free to use for anyone on the Hugging Face Hub. You can re-use the underlying pipeline to generate data with other [distilabel LLM integrations](https://distilabel.argilla.io/dev/components-gallery/llms/).
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                    #### What is Distilabel?
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers.
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                    #### What is synthetic data?
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                    Synthetic data is data generated by an AI model, instead of being collected from the real world.
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    #### What is AI feedback?
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                    AI feedback is feedback provided by an AI model, instead of being provided by a human.
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                    #### How is Distilabel different from other synthetic data generation frameworks?
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                    Distilabel can be used for generating synthetic data and AI feedback for a wide variety of projects including traditional predictive NLP (classification, extraction, etc.), or generative and large language model scenarios (instruction following, dialogue generation, judging etc.). Distilabel's programmatic approach allows you to build scalable pipelines for data generation and AI feedback. The goal of distilabel is to accelerate your AI development by quickly generating high-quality, diverse datasets based on verified research methodologies for generating and judging with AI feedback. So, Distilabel is focused and specifically designed to be a tool for scalable and reliable synthetic data generation.
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                    #### What do people use Distilabel for?
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                    The Argilla community uses distilabel to create amazing [datasets](https://huggingface.co/datasets?other=distilabel) and [models](https://huggingface.co/models?other=distilabel).
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                    - The [1M OpenHermesPreferences](https://huggingface.co/datasets/argilla/OpenHermesPreferences) is a dataset of ~1 million AI preferences derived from teknium/OpenHermes-2.5. It shows how we can use Distilabel to **synthesize data on an immense scale**.
         | 
| 47 | 
            +
                    - Our [distilabeled Intel Orca DPO dataset](https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs) and the [improved OpenHermes model](https://huggingface.co/argilla/distilabeled-OpenHermes-2.5-Mistral-7B) show how we **improve model performance by filtering out 50%** of the original dataset through **AI feedback**.
         | 
| 48 | 
            +
                    - The [haiku DPO data](https://github.com/davanstrien/haiku-dpo) outlines how anyone can create a **dataset for a specific task** and use **the latest research papers** to improve the quality of the dataset.
         | 
| 49 | 
            +
                    """
         | 
| 50 | 
            +
                )
         | 
    	
        src/distilabel_dataset_generator/sft.py
    CHANGED
    
    | @@ -262,10 +262,12 @@ with gr.Blocks( | |
| 262 | 
             
                title="⚗️ Distilabel Dataset Generator",
         | 
| 263 | 
             
                head="⚗️ Distilabel Dataset Generator",
         | 
| 264 | 
             
                css=get_css(),
         | 
| 265 | 
            -
            ) as  | 
| 266 | 
             
                gr.Markdown(
         | 
| 267 | 
             
                    """
         | 
| 268 | 
            -
            ### Generate a high quality SFT dataset in a breeze using [🐦⬛MagPie](https://arxiv.org/abs/2406.08464) and [🦙Llama 3.1 - 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct). | 
|  | |
|  | |
| 269 | 
             
            """
         | 
| 270 | 
             
                )
         | 
| 271 | 
             
                with gr.Row(variant="panel"):
         | 
| @@ -346,7 +348,5 @@ with gr.Blocks( | |
| 346 | 
             
                            ],
         | 
| 347 | 
             
                        )
         | 
| 348 |  | 
| 349 | 
            -
                 | 
| 350 | 
            -
                 | 
| 351 | 
            -
             | 
| 352 | 
            -
            demo
         | 
|  | |
| 262 | 
             
                title="⚗️ Distilabel Dataset Generator",
         | 
| 263 | 
             
                head="⚗️ Distilabel Dataset Generator",
         | 
| 264 | 
             
                css=get_css(),
         | 
| 265 | 
            +
            ) as app:
         | 
| 266 | 
             
                gr.Markdown(
         | 
| 267 | 
             
                    """
         | 
| 268 | 
            +
            ### Generate a high quality SFT dataset in a breeze using [🐦⬛MagPie](https://arxiv.org/abs/2406.08464) and [🦙Llama 3.1 - 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct).
         | 
| 269 | 
            +
             | 
| 270 | 
            +
            More information on distilabel and techniques can be found in the "FAQ" tab. The code can be found in the [Spaces repository](https://huggingface.co/spaces/argilla/distilabel-dataset-generator/tree/main).
         | 
| 271 | 
             
            """
         | 
| 272 | 
             
                )
         | 
| 273 | 
             
                with gr.Row(variant="panel"):
         | 
|  | |
| 348 | 
             
                            ],
         | 
| 349 | 
             
                        )
         | 
| 350 |  | 
| 351 | 
            +
                app.load(get_org_dropdown, outputs=[orgs_selector])
         | 
| 352 | 
            +
                app.load(fn=swap_visibilty, outputs=main_ui)
         | 
|  | |
|  | 
 
			
