MRasheq committed on
Commit 24cf4d3 · 1 Parent(s): d675e5b

Second Commit

Files changed (1)
  1. app.py +20 -14
app.py CHANGED
@@ -2,6 +2,7 @@ import os
 import json
 import gradio as gr
 import torch
+import pandas as pd
 from transformers import (
     TrainingArguments,
     Trainer,
@@ -29,6 +30,15 @@ def save_uploaded_file(file):
         f.write(file.read())
     return file_path
 
+def prepare_training_data(df):
+    """Convert DataFrame into Q&A format"""
+    formatted_data = []
+    for _, row in df.iterrows():
+        # Format each conversation in the required structure
+        formatted_text = f"User: {row['chunk_id']}\nAssistant: {row['text']}"
+        formatted_data.append({"text": formatted_text})
+    return formatted_data
+
 def prepare_training_components(
     data_path,
     learning_rate,
@@ -45,6 +55,10 @@ def prepare_training_components(
     os.makedirs(specific_output_dir, exist_ok=True)
     os.makedirs(LOGS_DIR, exist_ok=True)
 
+    # Load data and convert to Q&A format
+    df = pd.read_csv(data_path)
+    formatted_data = prepare_training_data(df)
+
     # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
@@ -87,13 +101,9 @@ def prepare_training_components(
         save_total_limit=2,
     )
 
-    # Load and prepare dataset
-    with open(data_path, 'r') as f:
-        raw_data = json.load(f)
-
     # Convert to datasets format
     dataset = Dataset.from_dict({
-        'text': [item['text'] for item in raw_data]
+        'text': [item['text'] for item in formatted_data]
     })
 
     # Create data collator
@@ -164,9 +174,9 @@ def create_interface():
     with gr.Row():
         with gr.Column():
             file_input = gr.File(
-                label="Upload Training Data (JSON)",
+                label="Upload Training Data (CSV)",
                 type="binary",
-                file_types=[".json"]
+                file_types=[".csv"]
             )
 
             learning_rate = gr.Slider(
@@ -205,13 +215,9 @@ def create_interface():
 
     gr.Markdown("""
    ## Instructions
-    1. Upload your training data in JSON format:
-    ```json
-    [
-        {"text": "User: Question\nAssistant: Answer"},
-        {"text": "User: Another question\nAssistant: Another answer"}
-    ]
-    ```
+    1. Upload your training data in CSV format with columns:
+       - chunk_id (questions)
+       - text (answers)
     2. Adjust training parameters if needed
     3. Click 'Start Training'
     4. Wait for training to complete
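
For reference, below is a minimal, self-contained sketch of the CSV-to-dataset path this commit introduces. The `sample.csv` file name and its two rows are made up for illustration; the `chunk_id`/`text` column names and the formatting logic mirror `prepare_training_data` in `app.py`.

```python
# Sketch of the new CSV -> formatted text -> Dataset pipeline.
# "sample.csv" and its rows are hypothetical example data.
import pandas as pd
from datasets import Dataset

csv_content = """chunk_id,text
What is Gradio?,Gradio is a Python library for building ML demos.
What does the Trainer do?,It wraps the PyTorch training loop for transformers models.
"""

with open("sample.csv", "w") as f:
    f.write(csv_content)

def prepare_training_data(df):
    """Convert DataFrame into Q&A format (same logic as in app.py)."""
    formatted_data = []
    for _, row in df.iterrows():
        # chunk_id is treated as the question, text as the answer
        formatted_text = f"User: {row['chunk_id']}\nAssistant: {row['text']}"
        formatted_data.append({"text": formatted_text})
    return formatted_data

df = pd.read_csv("sample.csv")
formatted_data = prepare_training_data(df)

# Same conversion the commit keeps, now fed from the CSV-derived list
dataset = Dataset.from_dict({
    "text": [item["text"] for item in formatted_data]
})

print(dataset[0]["text"])
# User: What is Gradio?
# Assistant: Gradio is a Python library for building ML demos.
```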
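On the interface side, the upload widget now restricts uploads to `.csv` while keeping `type="binary"`, so the handler receives raw bytes. The standalone sketch below shows that wiring with an illustrative `preview_csv` handler; the app itself saves the upload to disk and uses it for training instead.

```python
# Minimal Gradio sketch of a CSV upload configured as in the commit.
import io

import gradio as gr
import pandas as pd

def preview_csv(file_bytes):
    # type="binary" delivers the uploaded file as raw bytes
    df = pd.read_csv(io.BytesIO(file_bytes))
    return df.head().to_string(index=False)

with gr.Blocks() as demo:
    file_input = gr.File(
        label="Upload Training Data (CSV)",
        type="binary",
        file_types=[".csv"],
    )
    preview = gr.Textbox(label="Preview")
    file_input.upload(preview_csv, inputs=file_input, outputs=preview)

if __name__ == "__main__":
    demo.launch()
```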