Second Commit
app.py CHANGED
@@ -2,6 +2,7 @@ import os
 import json
 import gradio as gr
 import torch
+import pandas as pd
 from transformers import (
     TrainingArguments,
     Trainer,
@@ -29,6 +30,15 @@ def save_uploaded_file(file):
         f.write(file.read())
     return file_path
 
+def prepare_training_data(df):
+    """Convert DataFrame into Q&A format"""
+    formatted_data = []
+    for _, row in df.iterrows():
+        # Format each conversation in the required structure
+        formatted_text = f"User: {row['chunk_id']}\nAssistant: {row['text']}"
+        formatted_data.append({"text": formatted_text})
+    return formatted_data
+
 def prepare_training_components(
     data_path,
     learning_rate,
@@ -45,6 +55,10 @@ def prepare_training_components(
     os.makedirs(specific_output_dir, exist_ok=True)
     os.makedirs(LOGS_DIR, exist_ok=True)
 
+    # Load data and convert to Q&A format
+    df = pd.read_csv(data_path)
+    formatted_data = prepare_training_data(df)
+
     # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
@@ -87,13 +101,9 @@ def prepare_training_components(
         save_total_limit=2,
     )
 
-    # Load and prepare dataset
-    with open(data_path, 'r') as f:
-        raw_data = json.load(f)
-
     # Convert to datasets format
     dataset = Dataset.from_dict({
-        'text': [item['text'] for item in raw_data]
+        'text': [item['text'] for item in formatted_data]
     })
 
     # Create data collator
@@ -164,9 +174,9 @@ def create_interface():
         with gr.Row():
             with gr.Column():
                 file_input = gr.File(
-                    label="Upload Training Data (JSON)",
+                    label="Upload Training Data (CSV)",
                     type="binary",
-                    file_types=[".json"]
+                    file_types=[".csv"]
                 )
 
                 learning_rate = gr.Slider(
@@ -205,13 +215,9 @@ def create_interface():
 
         gr.Markdown("""
         ## Instructions
-        1. Upload your training data in JSON format:
-        ```json
-        [
-            {"text": "User: Question\nAssistant: Answer"},
-            {"text": "User: Another question\nAssistant: Another answer"}
-        ]
-        ```
+        1. Upload your training data in CSV format with columns:
+           - chunk_id (questions)
+           - text (answers)
        2. Adjust training parameters if needed
        3. Click 'Start Training'
        4. Wait for training to complete
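
For reference, below is a minimal, self-contained sketch of the CSV-based preparation flow this commit introduces. The prepare_training_data logic and the chunk_id/text column names come from the diff above; the sample DataFrame rows are hypothetical and only illustrate the expected input shape.

```python
# Sketch of the new CSV -> Q&A conversion (mirrors prepare_training_data in app.py).
# The sample rows below are illustrative only; a real run reads the uploaded CSV
# with pd.read_csv(data_path) as in the diff.
import pandas as pd

def prepare_training_data(df):
    """Convert DataFrame into Q&A format."""
    formatted_data = []
    for _, row in df.iterrows():
        # Each row becomes one "User: ...\nAssistant: ..." training example.
        formatted_text = f"User: {row['chunk_id']}\nAssistant: {row['text']}"
        formatted_data.append({"text": formatted_text})
    return formatted_data

# Hypothetical training CSV contents (chunk_id = question, text = answer).
df = pd.DataFrame({
    "chunk_id": ["What does this Space do?", "Which file format is expected?"],
    "text": ["It fine-tunes a causal LM with Trainer.", "A CSV with chunk_id and text columns."],
})

for example in prepare_training_data(df):
    print(example["text"])
```

In app.py itself, the resulting list is then passed to Dataset.from_dict under the 'text' key, as shown in the @@ -87,13 +101,9 @@ hunk above.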
|