"""Gradio app showing the public summary of training content for SmolLM3-3B."""

import logging
import os

import gradio as gr

logger = logging.getLogger(__name__)

# NOTE(review): the gr.HTML fragments in this file carry text only -- no tags or
# class attributes are visible here, even though the stylesheet below defines
# classes such as .section-header and .info-box. Confirm the markup against the
# deployed app before relying on these strings.

# Custom CSS for better styling
_CUSTOM_CSS = """
/* Global font size increase */
.gradio-container, .gradio-container * {
    font-size: 16px !important;
}

.main-container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
    font-size: 16px;
}

.banner-container {
    text-align: center;
    margin-bottom: 30px;
}

.section-header {
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 15px;
    border-radius: 10px;
    margin: 20px 0 10px 0;
    font-weight: bold;
    font-size: 1.4em !important;
}

.subsection-header {
    background: #f8f9fa;
    padding: 10px;
    border-left: 4px solid #667eea;
    margin: 15px 0 10px 0;
    font-weight: bold;
    font-size: 1.1em !important;
    color: #2c3e50;
}

.info-box {
    background: #ffffff;
    border: 1px solid #e9ecef;
    border-radius: 8px;
    padding: 15px;
    margin: 10px 0;
    font-size: 16px !important;
    color: #2c3e50;
}

.info-box p, .info-box li, .info-box ul {
    font-size: 16px !important;
    line-height: 1.6 !important;
    color: #2c3e50;
}

.highlight {
    background: #fff3cd;
    border: 1px solid #ffeaa7;
    border-radius: 5px;
    padding: 10px;
    margin: 5px 0;
    font-size: 16px !important;
}

.dataset-link {
    color: #667eea;
    text-decoration: none;
    font-weight: 500;
    font-size: 16px !important;
}

.dataset-link:hover {
    text-decoration: underline;
}

.checkbox-yes {
    color: #28a745;
    font-weight: bold;
    font-size: 16px !important;
}

.checkbox-no {
    color: #dc3545;
    font-weight: bold;
    font-size: 16px !important;
}

/* Ensure all text elements have larger font */
h1, h2, h3, h4, h5, h6 {
    font-size: 1.2em !important;
}

strong {
    font-size: inherit !important;
}

/* Dark mode adaptations */
@media (prefers-color-scheme: dark) {
    .subsection-header {
        background: #2d3748 !important;
        color: #e2e8f0 !important;
    }
    .info-box {
        background: #2d3748 !important;
        border: 1px solid #4a5568 !important;
        color: #e2e8f0 !important;
    }
    .info-box p, .info-box li, .info-box ul {
        color: #e2e8f0 !important;
    }
    .highlight {
        background: #553c9a !important;
        border: 1px solid #667eea !important;
        color: #e2e8f0 !important;
    }
    .dataset-link {
        color: #90cdf4 !important;
    }
    .checkbox-yes {
        color: #68d391 !important;
    }
    .checkbox-no {
        color: #fc8181 !important;
    }
    .tldr-box {
        background: linear-gradient(90deg, #2a365f 0%, #553c9a 100%) !important;
        color: #e2e8f0 !important;
    }
    .version-box {
        background: linear-gradient(90deg, #2a365f 0%, #553c9a 100%) !important;
        color: #e2e8f0 !important;
    }
}

/* Gradio dark mode detection - alternative approach */
.dark .subsection-header, [data-theme="dark"] .subsection-header {
    background: #2d3748 !important;
    color: #e2e8f0 !important;
}
.dark .info-box, [data-theme="dark"] .info-box {
    background: #2d3748 !important;
    border: 1px solid #4a5568 !important;
    color: #e2e8f0 !important;
}
.dark .info-box p, .dark .info-box li, .dark .info-box ul, [data-theme="dark"] .info-box p, [data-theme="dark"] .info-box li, [data-theme="dark"] .info-box ul {
    color: #e2e8f0 !important;
}
.dark .highlight, [data-theme="dark"] .highlight {
    background: #553c9a !important;
    border: 1px solid #667eea !important;
    color: #e2e8f0 !important;
}
.dark .dataset-link, [data-theme="dark"] .dataset-link {
    color: #90cdf4 !important;
}
.dark .checkbox-yes, [data-theme="dark"] .checkbox-yes {
    color: #68d391 !important;
}
.dark .checkbox-no, [data-theme="dark"] .checkbox-no {
    color: #fc8181 !important;
}
.dark .tldr-box, [data-theme="dark"] .tldr-box {
    background: linear-gradient(90deg, #2a365f 0%, #553c9a 100%) !important;
    color: #e2e8f0 !important;
}
.dark .version-box, [data-theme="dark"] .version-box {
    background: linear-gradient(90deg, #2a365f 0%, #553c9a 100%) !important;
    color: #e2e8f0 !important;
}
"""

# Label shared by the three collapsible "full information" sections.
_ACCORDION_LABEL = "👇 Click for full information"


def _image_or_placeholder(path: str) -> None:
    """Render the banner image at *path*, or an empty HTML placeholder.

    Must be called inside an active Gradio layout context; the component that
    gets created is attached to that context.

    The file is checked up front so that the fallback does not depend on the
    image component raising for a missing file. Any other failure while
    building the image is logged and also falls back to the placeholder, so a
    broken asset never prevents the page from being built.
    """
    if os.path.isfile(path):
        try:
            gr.Image(
                path,
                height=180,
                show_label=False,
                show_download_button=False,
                interactive=False,
                container=False,
            )
        except Exception:  # best-effort: a bad image must not break the page
            logger.warning(
                "Could not load image %r; using placeholder", path, exc_info=True
            )
        else:
            return
    gr.HTML("\n")  # Placeholder if image not found


def create_app() -> gr.Blocks:
    """Build the SmolLM3-3B EU data transparency page.

    The page consists of a three-column banner, an introduction with a TL;DR,
    and three numbered sections, each with a one-line TL;DR and a collapsed
    accordion holding the detailed subsections.

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here.
    """
    # NOTE(review): layout nesting was reconstructed from the order of the
    # calls; verify column/row placement against the rendered page.
    with gr.Blocks(css=_CUSTOM_CSS, title="SmolLM3-3B EU Data Transparency") as app:
        with gr.Column(elem_classes=["main-container"]):
            # Banner section with images
            with gr.Row():
                # Column scales are integers in the ratio 2:3:2, i.e. the
                # same 1 : 1.5 : 1 proportions without a fractional scale.
                with gr.Column(scale=2):
                    _image_or_placeholder("eu_flag.png")
                with gr.Column(scale=3):
                    gr.HTML(
                        "\n"
                        "\n"
                        "SmolLM3-3B\n"
                        "\n"
                        "Public Summary of Training Content\n"
                        "\n"
                        "Summary v1.0 - Last updated: 25/07/2025\n"
                    )
                with gr.Column(scale=2):
                    _image_or_placeholder("banner.png")

            gr.HTML(
                "\n"
                "\n"
                "This Space contains the transparency report for the SmolLM3-3B "
                "GPAI model developed by Hugging Face following the guidelines "
                "provided by the AI Office.\n"
                "It may serve as an example for open-source GPAI trained "
                "exclusively on public datasets. For more information, see the "
                "Explanatory Notice and Template\n"
                "\n"
                "📋 TL;DR\n"
                "\n"
                "SmolLM3-3B is a state-of-the-art 3-billion parameter language "
                "model by Hugging Face trained on 10+ trillion tokens from "
                "publicly available datasets including web documents, "
                "scientific articles, and code. Training focused on 6 EU "
                "languages plus others. The model uses only public datasets "
                "(no commercial licensing, user data, or other private data). "
                "Data processing was done by the original component dataset "
                "curators with varied approaches to TDM and filtering that "
                "typically include compliance with robots.txt and other "
                "opt-out mechanisms, and educational content classifiers.\n"
                "\n"
            )

            # Section 1: General Information
            gr.HTML("\n1. General information\n")
            gr.HTML(
                "\n"
                "\n"
                "TL;DR: Provider: Hugging Face | Model: SmolLM3-3B | "
                "Training: 10+ trillion tokens, 6 EU languages + others\n"
                "\n"
            )
            with gr.Accordion(_ACCORDION_LABEL, open=False):
                with gr.Row():
                    with gr.Column():
                        gr.HTML("\n1.1. Provider identification\n")
                    with gr.Column():
                        gr.HTML("\n1.2. Model identification\n")
                gr.HTML(
                    "\n"
                    "1.3. Modalities, overall training data size and other "
                    "characteristics\n"
                )

            # Section 2: Data Sources
            gr.HTML("\n2. List of data sources\n")
            gr.HTML(
                "\n"
                "\n"
                "TL;DR: ✅ Publicly available datasets including synthetic data "
                "| ❌ No commercial licensing, crawling, user data, or private "
                "data\n"
                "\n"
            )
            with gr.Accordion(_ACCORDION_LABEL, open=False):
                gr.HTML("\n2.1. Publicly available datasets\n")
                gr.HTML(
                    "\n"
                    "2.2. Private non-publicly available datasets obtained from "
                    "third parties\n"
                    "\n"
                    "2.2.1. Datasets commercially licensed by rightsholders or "
                    "their representatives\n"
                    "\n"
                    "2.2.2. Private datasets obtained from other third parties\n"
                    "\n"
                )
                with gr.Row():
                    with gr.Column():
                        gr.HTML(
                            "\n2.3. Data crawled and scraped from online sources\n"
                        )
                    with gr.Column():
                        gr.HTML("\n2.4. User data\n")
                with gr.Row():
                    with gr.Column():
                        gr.HTML("\n2.5. Synthetic data\n")
                    with gr.Column():
                        gr.HTML("\n2.6. Other sources of data\n")

            # Section 3: Data Processing
            gr.HTML("\n3. Data processing aspects\n")
            gr.HTML(
                "\n"
                "\n"
                "TL;DR: TDM rights: robots.txt baseline otherwise "
                "dataset-dependent | Content filtering: Dataset-dependent "
                "including educational classifiers\n"
                "\n"
            )
            with gr.Accordion(_ACCORDION_LABEL, open=False):
                gr.HTML(
                    "\n"
                    "3.1. Respect of reservation of rights from text and data "
                    "mining exception or limitation\n"
                )
                gr.HTML("\n3.2. Removal of illegal content\n")

    return app


# Create the demo instance directly for Gradio auto-reload
demo = create_app()

if __name__ == "__main__":
    demo.launch(share=True, show_error=True)