import csv
import os
import torch
import yaml
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# Relax the MPS memory watermark so Apple-silicon runs are not killed by the allocator cap
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
def load_data_and_config(data_path, config_path="config.yaml"):
    """Loads training data from a semicolon-delimited CSV and API settings from a YAML config."""
    # NOTE: the config_path default is an assumption; point it at your own YAML file,
    # which should define at least 'api_name' and 'base_url' (used in __main__ below).
    data = []
    with open(data_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')  # Delimiter must match the CSV file
        for row in reader:
            data.append({'text': row['description']})  # Training text lives in the 'description' column
    with open(config_path, encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return data, config
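# Illustrative CSV layout assumed by load_data_and_config (values are hypothetical):
#
#   description;other_field
#   Fetch the latest block via the REST endpoint;example
#   List transactions for an address;example
#
# Only the 'description' column is read; extra columns are ignored.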
def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generates an API query using the fine-tuned model."""
    inputs = tokenizer(prompt + f" Write an API query to {api_name} to get {desired_output}", return_tensors="pt")
    output = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
    # Decode only the newly generated tokens, skipping the echoed prompt
    query = tokenizer.decode(output[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
    return f"{base_url}/{query}"
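# Illustrative call (the API name and URL below are placeholders, not real endpoints):
#   generate_api_query(model, tokenizer,
#                      "I need the latest block", "latest block",
#                      "ExampleChainAPI", "https://api.example.com/v1")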
def train_model(model, tokenizer, data):
    """Trains the model using the Hugging Face Trainer API."""
    # Tokenize each example; labels mirror input_ids, with padding positions set to -100
    # so the loss function ignores them.
    inputs = [tokenizer(d['text'], max_length=512, truncation=True, padding='max_length', return_tensors="pt") for d in data]
    dataset = Dataset.from_dict({
        'input_ids': [x['input_ids'].squeeze() for x in inputs],  # drop the extra batch dimension
        'attention_mask': [x['attention_mask'].squeeze() for x in inputs],
        'labels': [x['input_ids'].squeeze().masked_fill(x['attention_mask'].squeeze() == 0, -100) for x in inputs],
    })
training_args = TrainingArguments(
output_dir='./results',
num_train_epochs=3,
per_device_train_batch_size=1,
gradient_accumulation_steps=1,
warmup_steps=500,
weight_decay=0.01,
logging_dir='./logs',
logging_steps=10,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset,
tokenizer=tokenizer
)
# The Trainer handles the training loop internally
trainer.train()
# Optionally clear cache if using GPU or MPS
if torch.cuda.is_available():
torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
torch.mps.empty_cache()
    # Persist the final checkpoint under output_dir
trainer.save_model()
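# Alternative sketch: rather than padding every example to max_length up front, a
# DataCollatorForLanguageModeling with mlm=False pads per batch and derives labels
# from input_ids automatically, e.g.:
#   from transformers import DataCollatorForLanguageModeling
#   collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
#   trainer = Trainer(model=model, args=training_args, train_dataset=dataset,
#                     data_collator=collator)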
if __name__ == "__main__":
    # Load training data and the API configuration (api_name, base_url)
    data, config = load_data_and_config("train2.csv")
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/codegemma-7b-it")
model = AutoModelForCausalLM.from_pretrained("google/codegemma-7b-it")
# Train the model on your dataset
train_model(model, tokenizer, data)
# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
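    # The saved model can later be reloaded for inference, e.g.:
    #   model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
    #   tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")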
# Example usage
prompt = "I need to retrieve the latest block on chain using a python script"
api_query = generate_api_query(model, tokenizer, prompt, "latest block on chain", config["api_name"], config["base_url"])
    print(f"Generated API query: {api_query}")