# NOTE(review): the following header was web-page residue from the hosting
# site (runtime status, file size, commit hashes, line-number gutter) and is
# not part of the Python source; it has been commented out so the file parses.
# Spaces: Runtime error | File size: 3,135 Bytes | 3d604a5 8da9de1
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import csv
import yaml
from datasets import Dataset
import os
# Disable the MPS high-watermark memory limit (Apple-silicon GPU backend):
# '0.0' lets PyTorch use all available unified memory. Must be set before
# any MPS allocation happens, hence it sits right after the imports.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
def load_data_and_config(data_path):
    """Read training examples from a semicolon-delimited CSV file.

    Each row must contain a 'description' column; its value becomes the
    'text' field of one training example.

    Args:
        data_path: Path to the CSV file.

    Returns:
        A list of dicts of the form ``{'text': <description>}``, one per row.
    """
    with open(data_path, newline='', encoding='utf-8') as csvfile:
        # Delimiter is ';' — must match the CSV file's actual separator.
        rows = csv.DictReader(csvfile, delimiter=';')
        return [{'text': row['description']} for row in rows]
def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generate an API query string using a fine-tuned causal LM.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Matching tokenizer (``encode``/``decode``).
        prompt: Free-form user request, prepended to the instruction.
        desired_output: Description of the data the query should return.
        api_name: Name of the target API, interpolated into the instruction.
        base_url: Base URL the generated query text is appended to.

    Returns:
        ``f"{base_url}/{generated_text}"``.
    """
    input_ids = tokenizer.encode(
        prompt + f" Write an API query to {api_name} to get {desired_output}",
        return_tensors="pt",
    )
    # do_sample=True is required for `temperature` to have any effect;
    # max_new_tokens bounds the continuation only (max_length would have
    # counted the prompt tokens against the budget as well).
    output = model.generate(input_ids, max_new_tokens=256, do_sample=True, temperature=0.7)
    # Causal LMs echo the prompt tokens at the start of `output`; decode only
    # the newly generated tokens so the prompt text doesn't end up in the URL.
    query = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return f"{base_url}/{query}"
from transformers import TrainingArguments, Trainer
def train_model(model, tokenizer, data):
    """Fine-tune `model` on `data` using the Hugging Face Trainer API.

    Args:
        model: Causal LM to fine-tune (updated in place by the Trainer).
        tokenizer: Tokenizer matching `model`; passed to the Trainer so it is
            saved alongside checkpoints.
        data: List of dicts with a 'text' key (as produced by
            load_data_and_config).
    """
    # Tokenize every example to a fixed length so examples batch cleanly.
    encoded = [
        tokenizer(d['text'], max_length=512, truncation=True,
                  padding='max_length', return_tensors="pt")
        for d in data
    ]
    input_ids = [e['input_ids'].squeeze() for e in encoded]       # drop batch dim
    attention_mask = [e['attention_mask'].squeeze() for e in encoded]
    # For causal-LM fine-tuning the labels are the inputs themselves, but
    # padded positions must be set to -100 so the loss ignores them;
    # copying input_ids verbatim would train the model to emit pad tokens.
    labels = []
    for ids, mask in zip(input_ids, attention_mask):
        lab = ids.clone()
        lab[mask == 0] = -100
        labels.append(lab)
    dataset = Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    })
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )
    # The Trainer handles the training loop internally.
    trainer.train()
    # Free cached accelerator memory after training.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():  # torch.has_mps is deprecated
        torch.mps.empty_cache()
    trainer.save_model()
if __name__ == "__main__":
# Load data and configurations
data = load_data_and_config("train2.csv")
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/codegemma-7b-it")
model = AutoModelForCausalLM.from_pretrained("google/codegemma-7b-it")
# Train the model on your dataset
train_model(model, tokenizer, data)
# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
# Example usage
prompt = "I need to retrieve the latest block on chain using a python script"
api_query = generate_api_query(model, tokenizer, prompt, "latest block on chain", config["api_name"], config["base_url"])
print(f"Generated code: {api_query}")
# (stray line-gutter character from the scraped page removed)