# File size: 3,160 Bytes
# b4ff959
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import datasets
import torch
import json
import os
# Best-effort check that `accelerate` is importable; if it is not, try to
# install it with pip. The `except` clause needs its matching `try:` --
# without it the whole module fails to parse with a SyntaxError.
# NOTE(review): this runs after `transformers` has already been imported
# above; a freshly installed package may only be picked up after the
# process restarts -- confirm on the target environment.
try:
    import accelerate
except ImportError:
    os.system('pip install "accelerate>=0.26.0"')

# Model setup
# One checkpoint id is shared by the tokenizer and the model so they stay in sync.
MODEL_ID = "facebook/opt-350m"  # Smaller, open access model

# Both objects are created once, at import time, and are reused by the
# training callback defined further down in this file.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # weights are loaded in half precision
    device_map="auto",          # device placement is delegated to the library
)

# Function to process uploaded JSON and train
def train_ui_tars(file):
    """Fine-tune the module-level ``model`` on an uploaded JSON dataset.

    Args:
        file: The value delivered by the Gradio file component. Either an
            object exposing the temp-file path as ``.name`` or a plain path
            string is accepted; ``None`` (nothing uploaded) is reported as an
            error.

    Returns:
        str: A human-readable status message. Every failure is caught and
        returned as ``"Error: ..."`` instead of being raised, so the UI can
        display it in the status textbox.

    Expected JSON layout: either a top-level list of records, or an object
    whose ``"training_pairs"`` key holds the records (an object without that
    key is used as-is). Each record must provide ``"input"`` and ``"output"``
    text fields.
    """
    try:
        if file is None:
            return "Error: no file was uploaded"

        # Step 1: Load and preprocess the uploaded JSON file.
        # Accept both an upload object (path in `.name`) and a bare path string.
        json_path = getattr(file, "name", file)
        with open(json_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Extract training pairs or use flat structure. Only a dict has
        # `.get`; a top-level list of records is already the flat structure.
        if isinstance(raw_data, dict):
            training_data = raw_data.get("training_pairs", raw_data)
        else:
            training_data = raw_data

        if not training_data:
            return "Error: uploaded JSON contains no training examples"

        # Save the extracted records to a standalone file for the JSON loader.
        fixed_json_path = "fixed_fraud_data.json"
        with open(fixed_json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, indent=4)

        # Load dataset
        dataset = datasets.load_dataset("json", data_files=fixed_json_path)

        # Step 2: Tokenize dataset
        max_length = 512
        eos_text = tokenizer.eos_token or ""

        def tokenize_data(example):
            # `batched=True` => each field is a list of strings.
            # A causal LM learns to continue text, so every example is the
            # prompt followed by the answer in ONE sequence. The newline
            # separates the two parts; use the same format at inference time.
            prompts = [text + "\n" for text in example["input"]]
            full_texts = [
                prompt + answer + eos_text
                for prompt, answer in zip(prompts, example["output"])
            ]
            encoded = tokenizer(full_texts, truncation=True, max_length=max_length)
            prompt_ids = tokenizer(prompts, truncation=True, max_length=max_length)["input_ids"]

            # Labels mirror input_ids, with the prompt positions set to -100
            # so the loss is computed on the answer tokens only. The prompt
            # length comes from tokenizing the prompt alone, which is the
            # usual approximation of the boundary inside the full sequence.
            labels = []
            for ids, p_ids in zip(encoded["input_ids"], prompt_ids):
                n_prompt = min(len(p_ids), len(ids))
                labels.append([-100] * n_prompt + list(ids[n_prompt:]))
            encoded["labels"] = labels
            return encoded

        tokenized_dataset = dataset.map(tokenize_data, batched=True)

        # Step 3: Training setup
        # No evaluation dataset is supplied, so the evaluation strategy is
        # left at its default ("no"). The explicit `evaluation_strategy`
        # keyword was renamed in newer transformers releases, so omitting it
        # keeps this call working across versions.
        training_args = TrainingArguments(
            output_dir="./fine_tuned_llama2",
            per_device_train_batch_size=2,
            save_strategy="epoch",
            save_total_limit=2,
            num_train_epochs=3,
            learning_rate=2e-5,
            weight_decay=0.01,
            logging_dir="./logs"
        )

        # The collator pads each batch dynamically and pads `labels` with
        # -100, so padding positions never contribute to the loss.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
        )

        # Step 4: Start training
        trainer.train()

        # Step 5: Save the model
        model.save_pretrained("./fine_tuned_llama2")
        tokenizer.save_pretrained("./fine_tuned_llama2")

        return "Training completed successfully! Model saved to ./fine_tuned_llama2"

    except Exception as e:
        # Surface any failure as a status string rather than crashing the UI callback.
        return f"Error: {str(e)}"

# Gradio UI
# Build the Blocks container first, then populate it inside its context.
demo = gr.Blocks(title="Model Fine-Tuning Interface")
with demo:
    # Page heading and a short usage hint.
    gr.Markdown("# OPT-350M Fine-Tuning UI")
    gr.Markdown("Upload a JSON file with 'input' and 'output' pairs to fine-tune the model on your fraud dataset.")

    # Widgets: dataset upload, trigger button, and a textbox for the result message.
    file_input = gr.File(label="Upload Fraud Dataset (JSON)")
    train_button = gr.Button("Start Fine-Tuning")
    output = gr.Textbox(label="Training Status")

    # Clicking the button passes the uploaded file to the training callback
    # and shows the returned status string in the textbox.
    train_button.click(train_ui_tars, file_input, output)

# Launch the app
demo.launch()