"""
Hugging Face model interface for code generation fine-tuning.
"""
import streamlit as st
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset
import os
import json
@st.cache_resource(show_spinner=False)
def load_model_and_tokenizer(model_name):
"""
Load a pre-trained model and tokenizer from Hugging Face.
Args:
model_name: Name of the model on Hugging Face (e.g., 'Salesforce/codet5-base')
Returns:
Tuple of (tokenizer, model)
"""
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
return tokenizer, model
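

# Usage sketch for load_model_and_tokenizer. The checkpoint name below is an
# example, not a requirement; any seq2seq checkpoint on the Hub works, and
# Streamlit caches the result across app reruns:
#
#     tokenizer, model = load_model_and_tokenizer("Salesforce/codet5-small")
#     st.write(f"Loaded model with {model.num_parameters():,} parameters")
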
def preprocess_code_dataset(dataset_df, tokenizer, max_input_length=256, max_target_length=256, task_prefix=""):
"""
Preprocess the code dataset for fine-tuning.
Args:
dataset_df: Pandas DataFrame with 'input' and 'target' columns
tokenizer: HuggingFace tokenizer
max_input_length: Maximum length for input sequences
max_target_length: Maximum length for target sequences
task_prefix: Prefix to add to inputs (e.g., "translate code to comment: ")
    Returns:
        Tuple of (train_dataset, eval_dataset) HuggingFace Datasets ready for fine-tuning
"""
def preprocess_function(examples):
inputs = [task_prefix + text for text in examples["input"]]
targets = examples["target"]
model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
        # Tokenize the targets with the tokenizer configured for decoder output
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")
        # Replace padding token ids in the labels with -100 so the loss ignores them
        model_inputs["labels"] = [
            [(token if token != tokenizer.pad_token_id else -100) for token in label_ids]
            for label_ids in labels["input_ids"]
        ]
return model_inputs
# Convert DataFrame to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(dataset_df)
# Split dataset into train and validation
splits = hf_dataset.train_test_split(test_size=0.1)
train_dataset = splits["train"]
eval_dataset = splits["test"]
# Apply preprocessing
train_dataset = train_dataset.map(
preprocess_function,
batched=True,
remove_columns=["input", "target"]
)
eval_dataset = eval_dataset.map(
preprocess_function,
batched=True,
remove_columns=["input", "target"]
)
return train_dataset, eval_dataset
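

# Usage sketch for preprocess_code_dataset with hypothetical rows; a real
# DataFrame needs enough rows for the 90/10 train/validation split, and the
# task_prefix should match the one used later at inference time:
#
#     df = pd.DataFrame({
#         "input": ["def add(a, b):\n    return a + b",
#                   "def is_even(n):\n    return n % 2 == 0"],
#         "target": ["Add two numbers and return the result.",
#                    "Check whether a number is even."],
#     })
#     train_ds, eval_ds = preprocess_code_dataset(
#         df, tokenizer, task_prefix="translate code to comment: "
#     )
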
def setup_trainer(model, tokenizer, train_dataset, eval_dataset, output_dir, training_args):
"""
Set up the Trainer for fine-tuning.
Args:
model: HuggingFace model
tokenizer: HuggingFace tokenizer
train_dataset: Preprocessed training dataset
eval_dataset: Preprocessed evaluation dataset
output_dir: Directory to save model and checkpoints
training_args: Dictionary of training arguments
Returns:
HuggingFace Trainer
"""
# Define training arguments
args = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=training_args.get("batch_size", 8),
per_device_eval_batch_size=training_args.get("batch_size", 8),
learning_rate=training_args.get("learning_rate", 5e-5),
num_train_epochs=training_args.get("epochs", 3),
weight_decay=training_args.get("weight_decay", 0.01),
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
push_to_hub=False,
gradient_accumulation_steps=training_args.get("gradient_accumulation", 1),
warmup_steps=training_args.get("warmup_steps", 100),
logging_dir=os.path.join(output_dir, "logs"),
logging_steps=10,
)
# Data collator
data_collator = DataCollatorForSeq2Seq(
tokenizer,
model=model,
        # -100 matches the masking applied in preprocessing so padded label
        # positions are ignored by the loss
        label_pad_token_id=-100,
pad_to_multiple_of=8
)
# Initialize Trainer
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
)
return trainer
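

# Usage sketch for setup_trainer; the output directory and hyperparameters are
# illustrative. save_training_config (defined below) keeps the run settings
# next to the saved weights:
#
#     training_args = {"batch_size": 8, "learning_rate": 5e-5, "epochs": 3}
#     output_dir = "models/codet5-finetuned"
#     trainer = setup_trainer(model, tokenizer, train_ds, eval_ds, output_dir, training_args)
#     trainer.train()
#     trainer.save_model(output_dir)
#     tokenizer.save_pretrained(output_dir)
#     save_training_config(output_dir, training_args)
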
def generate_code_comment(model, tokenizer, code, max_length=100, task_prefix="translate code to comment: "):
"""
Generate a comment for a given code snippet.
Args:
model: Fine-tuned model
tokenizer: Tokenizer
code: Input code snippet
max_length: Maximum length of the generated comment
task_prefix: Prefix to add to the input
Returns:
Generated comment as string
"""
inputs = tokenizer(task_prefix + code, return_tensors="pt", padding=True, truncation=True)
# Move inputs to the same device as model
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}
# Generate
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
comment = tokenizer.decode(output_ids[0], skip_special_tokens=True)
return comment
def generate_code_from_comment(model, tokenizer, comment, max_length=200, task_prefix="translate comment to code: "):
"""
Generate code from a given comment/description.
Args:
model: Fine-tuned model
tokenizer: Tokenizer
comment: Input comment or description
max_length: Maximum length of the generated code
task_prefix: Prefix to add to the input
Returns:
Generated code as string
"""
inputs = tokenizer(task_prefix + comment, return_tensors="pt", padding=True, truncation=True)
# Move inputs to the same device as model
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}
# Generate
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
code = tokenizer.decode(output_ids[0], skip_special_tokens=True)
return code
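

# Usage sketch for the two generation helpers; the default task prefixes mirror
# the prefixes assumed during preprocessing:
#
#     summary = generate_code_comment(model, tokenizer, "def area(r):\n    return 3.14159 * r * r")
#     snippet = generate_code_from_comment(model, tokenizer, "Return the area of a circle given its radius.")
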
def save_training_config(output_dir, config):
"""
Save training configuration to a JSON file.
Args:
output_dir: Directory to save the configuration
config: Dictionary with training configuration
"""
    # Ensure the output directory exists before writing
    os.makedirs(output_dir, exist_ok=True)
    config_path = os.path.join(output_dir, "training_config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
def load_training_config(model_dir):
"""
Load training configuration from a JSON file.
Args:
model_dir: Directory with the saved model
Returns:
Dictionary with training configuration
"""
config_path = os.path.join(model_dir, "training_config.json")
if os.path.exists(config_path):
with open(config_path, "r") as f:
return json.load(f)
return {}
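

if __name__ == "__main__":
    # Minimal smoke test, assuming the public Salesforce/codet5-small checkpoint
    # is reachable; a fine-tuned model directory saved by the Trainer can be
    # passed instead. Streamlit may warn that no runtime is active when this
    # module is run directly with `python model_interface.py`.
    demo_tokenizer, demo_model = load_model_and_tokenizer("Salesforce/codet5-small")
    demo_code = "def add(a, b):\n    return a + b"
    print(generate_code_comment(demo_model, demo_tokenizer, demo_code))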