# DeepSeek-R1 LoRA finetuning Space (Gradio app)
import json
import os

import gradio as gr
import torch
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# Constants | |
MODEL_NAME = "deepseek-ai/DeepSeek-R1" | |
OUTPUT_DIR = "finetuned_models" | |
LOGS_DIR = "training_logs" | |
def save_uploaded_file(file):
    """Persist an uploaded file under ./uploads and return its path.

    Handles the three payload shapes gr.File can deliver depending on its
    `type` setting: raw ``bytes`` ("binary" — what this app's interface
    uses, and which has no .name/.read(), so the old file-object-only code
    crashed on every upload), a temp-file path string ("filepath"), or a
    file-like object with .name and .read().

    Args:
        file: bytes, path string, or file-like object from the upload widget.

    Returns:
        Path of the saved copy inside the 'uploads' directory.
    """
    os.makedirs('uploads', exist_ok=True)
    if isinstance(file, bytes):
        # type="binary" gives raw bytes with no filename; synthesize one.
        import time
        file_path = os.path.join(
            'uploads', f"upload_{time.strftime('%Y%m%d_%H%M%S')}.json"
        )
        data = file
    elif isinstance(file, str):
        # type="filepath" gives the path of a Gradio temp file.
        file_path = os.path.join('uploads', os.path.basename(file))
        with open(file, 'rb') as src:
            data = src.read()
    else:
        # File-like object (original behavior). basename() guards against
        # .name being an absolute temp path, which would break the join.
        file_path = os.path.join('uploads', os.path.basename(file.name))
        data = file.read()
    with open(file_path, 'wb') as f:
        f.write(data)
    return file_path
def prepare_training_components(
    data_path,
    learning_rate,
    num_epochs,
    batch_size,
    model_name=MODEL_NAME
):
    """Build everything Trainer needs for one LoRA finetuning run.

    Args:
        data_path: Path to a JSON file shaped ``[{"text": ...}, ...]``.
        learning_rate: Optimizer learning rate.
        num_epochs: Number of training epochs.
        batch_size: Per-device train batch size.
        model_name: HF Hub model id to finetune (defaults to MODEL_NAME).

    Returns:
        Dict with 'model', 'tokenizer', 'training_args', 'dataset'
        (tokenized), 'data_collator' and the run-specific 'output_dir'.
    """
    # Timestamped directories so repeated runs never clobber each other.
    import time
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    specific_output_dir = os.path.join(OUTPUT_DIR, f"run_{timestamp}")
    os.makedirs(specific_output_dir, exist_ok=True)
    os.makedirs(LOGS_DIR, exist_ok=True)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Causal-LM tokenizers often ship without a pad token; the collator
        # needs one to batch variable-length sequences.
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        load_in_8bit=True
    )

    # LoRA configuration: adapters on attention + MLP projection layers.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Prepare the quantized model for training and attach the adapters.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

    # Training arguments. NOTE: evaluation is intentionally disabled —
    # no eval dataset is ever built, and evaluation_strategy="epoch"
    # without an eval_dataset makes Trainer raise at startup.
    training_args = TrainingArguments(
        output_dir=specific_output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        fp16=True,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        logging_dir=os.path.join(LOGS_DIR, f"run_{timestamp}"),
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
    )

    # Load and prepare dataset
    with open(data_path, 'r') as f:
        raw_data = json.load(f)

    dataset = Dataset.from_dict({
        'text': [item['text'] for item in raw_data]
    })

    # Tokenize up front: DataCollatorForLanguageModeling expects input_ids,
    # not raw text, so training would crash on an untokenized dataset.
    def tokenize_fn(batch):
        return tokenizer(batch['text'], truncation=True, max_length=512)

    dataset = dataset.map(tokenize_fn, batched=True, remove_columns=['text'])

    # mlm=False -> plain causal-LM objective (labels are shifted input_ids).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    return {
        'model': model,
        'tokenizer': tokenizer,
        'training_args': training_args,
        'dataset': dataset,
        'data_collator': data_collator,
        'output_dir': specific_output_dir
    }
def train_model(
    file,
    learning_rate=2e-4,
    num_epochs=3,
    batch_size=4,
    progress=gr.Progress()
):
    """Gradio callback: finetune the model on an uploaded JSON dataset.

    Returns a human-readable status string for the UI; any exception is
    caught and reported as text rather than crashing the interface.
    """
    try:
        # Persist the upload, then assemble model/tokenizer/args/dataset.
        data_path = save_uploaded_file(file)

        progress(0.2, desc="Preparing training components...")
        parts = prepare_training_components(
            data_path, learning_rate, num_epochs, batch_size
        )

        progress(0.4, desc="Initializing trainer...")
        trainer = Trainer(
            model=parts['model'],
            args=parts['training_args'],
            train_dataset=parts['dataset'],
            data_collator=parts['data_collator'],
        )

        progress(0.5, desc="Training model...")
        trainer.train()

        # Persist both the adapter weights and the tokenizer beside them.
        progress(0.9, desc="Saving model...")
        trainer.save_model()
        parts['tokenizer'].save_pretrained(parts['output_dir'])

        progress(1.0, desc="Training complete!")
        return f"Training completed! Model saved in {parts['output_dir']}"
    except Exception as e:
        return f"Error during training: {str(e)}"
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for finetuning."""
    with gr.Blocks() as demo:
        gr.Markdown("# DeepSeek-R1 Model Finetuning Interface")

        with gr.Row():
            with gr.Column():
                # Left column: inputs plus the trigger button.
                upload = gr.File(
                    label="Upload Training Data (JSON)",
                    type="binary",
                    file_types=[".json"]
                )
                lr_slider = gr.Slider(
                    label="Learning Rate",
                    minimum=1e-5,
                    maximum=1e-3,
                    value=2e-4
                )
                epochs_slider = gr.Slider(
                    label="Number of Epochs",
                    minimum=1,
                    maximum=10,
                    value=3,
                    step=1
                )
                batch_slider = gr.Slider(
                    label="Batch Size",
                    minimum=1,
                    maximum=8,
                    value=4,
                    step=1
                )
                start_btn = gr.Button("Start Training")

            with gr.Column():
                # Right column: training status readout.
                status_box = gr.Textbox(label="Training Status")

        start_btn.click(
            fn=train_model,
            inputs=[upload, lr_slider, epochs_slider, batch_slider],
            outputs=status_box
        )

        gr.Markdown("""
        ## Instructions
        1. Upload your training data in JSON format:
        ```json
        [
            {"text": "User: Question\nAssistant: Answer"},
            {"text": "User: Another question\nAssistant: Another answer"}
        ]
        ```
        2. Adjust training parameters if needed
        3. Click 'Start Training'
        4. Wait for training to complete
        """)
    return demo
if __name__ == "__main__":
    # Make sure the run/log roots exist before the first training run.
    for directory in (OUTPUT_DIR, LOGS_DIR):
        os.makedirs(directory, exist_ok=True)

    # Build the UI and expose it via a public share link.
    create_interface().launch(share=True)