import csv
import os

import torch
import yaml
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Disable the MPS memory high-watermark limit so large allocations are not refused on Apple Silicon.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

def load_data_and_config(data_path, config_path="config.yaml"):
    """Loads training data from a CSV file and API settings from a YAML config.

    The config path and its expected keys (api_name, base_url) are assumptions
    inferred from how the result is used in __main__.
    """
    data = []
    with open(data_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')  # delimiter must match the CSV file
        for row in reader:
            data.append({'text': row['description']})  # the CSV must provide a 'description' column
    with open(config_path, encoding='utf-8') as configfile:
        config = yaml.safe_load(configfile)
    return data, config
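
# Expected input files (formats are assumptions inferred from the loader and __main__):
#
# train2.csv  - semicolon-delimited, with a 'description' column holding one
#               natural-language request per row.
# config.yaml - API settings used when building queries, e.g. (placeholder values):
#   api_name: example-api
#   base_url: https://api.example.com/v1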

def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generates an API query using the fine-tuned model."""
    input_ids = tokenizer.encode(
        prompt + f" Write an API query to {api_name} to get {desired_output}",
        return_tensors="pt",
    ).to(model.device)
    # Sampling must be enabled for temperature to take effect.
    output = model.generate(input_ids, max_new_tokens=256, do_sample=True, temperature=0.7)
    # Decode only the newly generated tokens, not the echoed prompt.
    query = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return f"{base_url}/{query}"


def train_model(model, tokenizer, data):
    """Trains the model using the Hugging Face Trainer API."""
    # Tokenize each example; for causal LM fine-tuning the labels are the input ids,
    # with padded positions set to -100 so they are ignored by the loss.
    inputs = [tokenizer(d['text'], max_length=512, truncation=True, padding='max_length', return_tensors="pt") for d in data]
    input_ids = [x['input_ids'].squeeze(0) for x in inputs]
    attention_mask = [x['attention_mask'].squeeze(0) for x in inputs]
    labels = [ids.masked_fill(mask == 0, -100) for ids, mask in zip(input_ids, attention_mask)]
    dataset = Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    })
    
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer
    )

    # The Trainer handles the training loop internally
    trainer.train()

    # Optionally clear cache if using GPU or MPS
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()

    # Perform any remaining steps such as logging, saving, etc.
    trainer.save_model()

if __name__ == "__main__":
    # Load training data and API configuration (config.yaml is an assumed default path)
    data, config = load_data_and_config("train2.csv")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("google/codegemma-7b-it")
    model = AutoModelForCausalLM.from_pretrained("google/codegemma-7b-it")

    # Train the model on your dataset
    train_model(model, tokenizer, data)

    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")
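
    # The fine-tuned model can later be reloaded from the saved directory
    # (sketch; same paths as above):
    #   tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
    #   model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")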

    # Example usage
    prompt = "I need to retrieve the latest block on chain using a python script"
    api_query = generate_api_query(model, tokenizer, prompt, "latest block on chain", config["api_name"], config["base_url"])
    print(f"Generated code: {api_query}")