MRasheq committed
Commit 117a333 · 1 Parent(s): a9281cb

Fifth commit

Files changed (1)
  1. app.py +94 -80
app.py CHANGED
@@ -26,37 +26,47 @@ def save_uploaded_file(file_obj):
     """Save uploaded file and return its path"""
     try:
         os.makedirs('uploads', exist_ok=True)
+        import tempfile

-        if hasattr(file_obj, 'name'):
-            # If it's a FileUpload object
-            file_path = os.path.join('uploads', os.path.basename(file_obj.name))
-            if isinstance(file_obj, (bytes, bytearray)):
-                with open(file_path, 'wb') as f:
-                    f.write(file_obj)
-            else:
-                file_obj.save(file_path)
+        # Create a temporary file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', dir='uploads')
+
+        # Write the content
+        if isinstance(file_obj, (bytes, bytearray)):
+            temp_file.write(file_obj)
         else:
-            # If it's raw bytes
-            import tempfile
-            fd, file_path = tempfile.mkstemp(suffix='.csv', dir='uploads')
-            with os.fdopen(fd, 'wb') as temp:
-                if isinstance(file_obj, (bytes, bytearray)):
-                    temp.write(file_obj)
-                else:
-                    temp.write(file_obj.read())
+            content = file_obj.read()
+            if isinstance(content, str):
+                temp_file.write(content.encode('utf-8'))
+            else:
+                temp_file.write(content)
+
+        temp_file.close()
+        return temp_file.name

-        return file_path
     except Exception as e:
         raise Exception(f"Error saving file: {str(e)}")

 def prepare_training_data(df):
     """Convert DataFrame into Q&A format"""
     formatted_data = []
-    for _, row in df.iterrows():
-        # Format each conversation in the required structure
-        formatted_text = f"User: {row['chunk_id']}\nAssistant: {row['text']}"
-        formatted_data.append({"text": formatted_text})
-    return formatted_data
+    try:
+        for _, row in df.iterrows():
+            # Clean and validate the data
+            chunk_id = str(row['chunk_id']).strip()
+            text = str(row['text']).strip()
+
+            if chunk_id and text:  # Only include non-empty pairs
+                # Format each conversation in the required structure
+                formatted_text = f"User: {chunk_id}\nAssistant: {text}"
+                formatted_data.append({"text": formatted_text})
+
+        if not formatted_data:
+            raise ValueError("No valid training pairs found in the data")
+
+        return formatted_data
+    except Exception as e:
+        raise Exception(f"Error preparing training data: {str(e)}")

 def prepare_training_components(
     data_path,
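For context on the two helpers rewritten above: the new `save_uploaded_file` persists the upload with `tempfile.NamedTemporaryFile(delete=False, ...)` and returns the file path, and `prepare_training_data` expects a CSV with `chunk_id` (question) and `text` (answer) columns. A minimal, self-contained sketch of that flow under the same assumptions, with the sample rows invented purely for illustration:

```python
import os
import tempfile

import pandas as pd

# Stand-in for an uploaded CSV, already as raw bytes (illustrative content only).
raw = b"chunk_id,text\nWhat does this Space do?,It fine-tunes a model on Q&A pairs.\n"

# Same pattern as the new save_uploaded_file: keep the temp file around
# (delete=False) and hand back its path for later loading.
os.makedirs("uploads", exist_ok=True)
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", dir="uploads")
tmp.write(raw)
tmp.close()

# Same cleaning and formatting as the new prepare_training_data.
df = pd.read_csv(tmp.name, encoding="utf-8")
formatted = []
for _, row in df.iterrows():
    chunk_id, text = str(row["chunk_id"]).strip(), str(row["text"]).strip()
    if chunk_id and text:  # keep only non-empty pairs
        formatted.append({"text": f"User: {chunk_id}\nAssistant: {text}"})

print(formatted[0]["text"])
```

Note that `delete=False` means saved uploads accumulate under `uploads/` until they are removed explicitly.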
@@ -66,6 +76,8 @@ def prepare_training_components(
     model_name=MODEL_NAME
 ):
     """Prepare model, tokenizer, and training arguments"""
+    print(f"Loading data from: {data_path}")  # Debug logging
+    """Prepare model, tokenizer, and training arguments"""

     # Create output directory with timestamp
     import time
@@ -75,8 +87,14 @@ def prepare_training_components(
     os.makedirs(LOGS_DIR, exist_ok=True)

     # Load data and convert to Q&A format
-    df = pd.read_csv(data_path)
-    formatted_data = prepare_training_data(df)
+    try:
+        df = pd.read_csv(data_path, encoding='utf-8')
+        print(f"Loaded CSV with {len(df)} rows")  # Debug logging
+        formatted_data = prepare_training_data(df)
+        print(f"Prepared {len(formatted_data)} training examples")  # Debug logging
+    except Exception as e:
+        print(f"Error loading CSV: {str(e)}")  # Debug logging
+        raise

     # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
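The guarded load above (explicit `encoding='utf-8'` plus debug prints) can also be run on its own as a pre-flight check of a training CSV before a run is launched. A rough standalone sketch; the `check_training_csv` helper, the required-column check, and the default `train.csv` path are illustrative additions, not part of this commit:

```python
import sys

import pandas as pd


def check_training_csv(path: str) -> None:
    """Load the CSV the same way app.py does and report basic stats."""
    try:
        df = pd.read_csv(path, encoding="utf-8")
    except Exception as e:
        sys.exit(f"Error loading CSV: {e}")

    missing = {"chunk_id", "text"} - set(df.columns)
    if missing:
        sys.exit(f"Missing required columns: {sorted(missing)}")

    print(f"Loaded CSV with {len(df)} rows")


if __name__ == "__main__":
    check_training_csv(sys.argv[1] if len(sys.argv) > 1 else "train.csv")
```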
@@ -231,63 +249,59 @@ def train_model(
 # Create Gradio interface
 def create_interface():
     # Configure Gradio to handle larger file uploads
-    demo = gr.Interface(
-        title="Model Fine-tuning Interface"
-    )
-
     gr.Config(upload_size_limit=100)
-
-    with gr.Row():
-        with gr.Column():
-            file_input = gr.File(
-                label="Upload Training Data (CSV)",
-                type="binary",
-                file_types=[".csv"]
-            )
-
-            learning_rate = gr.Slider(
-                minimum=1e-5,
-                maximum=1e-3,
-                value=2e-4,
-                label="Learning Rate"
-            )
-
-            num_epochs = gr.Slider(
-                minimum=1,
-                maximum=10,
-                value=3,
-                step=1,
-                label="Number of Epochs"
-            )
-
-            batch_size = gr.Slider(
-                minimum=1,
-                maximum=8,
-                value=4,
-                step=1,
-                label="Batch Size"
-            )
-
-            train_button = gr.Button("Start Training")
-
-        with gr.Column():
-            output = gr.Textbox(label="Training Status")
-
-    train_button.click(
-        fn=train_model,
-        inputs=[file_input, learning_rate, num_epochs, batch_size],
-        outputs=output
-    )
-
-    gr.Markdown("""
-    ## Instructions
-    1. Upload your training data in CSV format with columns:
-       - chunk_id (questions)
-       - text (answers)
-    2. Adjust training parameters if needed
-    3. Click 'Start Training'
-    4. Wait for training to complete
-    """)
+
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(
+                label="Upload Training Data (CSV)",
+                type="binary",
+                file_types=[".csv"]
+            )
+
+            learning_rate = gr.Slider(
+                minimum=1e-5,
+                maximum=1e-3,
+                value=2e-4,
+                label="Learning Rate"
+            )
+
+            num_epochs = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=3,
+                step=1,
+                label="Number of Epochs"
+            )
+
+            batch_size = gr.Slider(
+                minimum=1,
+                maximum=8,
+                value=4,
+                step=1,
+                label="Batch Size"
+            )
+
+            train_button = gr.Button("Start Training")
+
+        with gr.Column():
+            output = gr.Textbox(label="Training Status")
+
+    train_button.click(
+        fn=train_model,
+        inputs=[file_input, learning_rate, num_epochs, batch_size],
+        outputs=output
+    )
+
+    gr.Markdown("""
+    ## Instructions
+    1. Upload your training data in CSV format with columns:
+       - chunk_id (questions)
+       - text (answers)
+    2. Adjust training parameters if needed
+    3. Click 'Start Training'
+    4. Wait for training to complete
+    """)

     return demo
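For reference, the row/column layout added in the hunk above is the kind of UI that is normally assembled inside a `gr.Blocks()` context, which is also what would bind the `demo` returned at the end of `create_interface()`. A minimal, self-contained sketch of that wiring; the `gr.Blocks` wrapper and the placeholder callback are assumptions standing in for the real `train_model` in app.py:

```python
import gradio as gr


def train_model(file_obj, learning_rate, num_epochs, batch_size):
    # Placeholder for the real training entry point in app.py.
    return f"Would train for {num_epochs} epochs at lr={learning_rate}, batch size {batch_size}"


def create_interface():
    with gr.Blocks(title="Model Fine-tuning Interface") as demo:
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Training Data (CSV)",
                    type="binary",
                    file_types=[".csv"]
                )
                learning_rate = gr.Slider(1e-5, 1e-3, value=2e-4, label="Learning Rate")
                num_epochs = gr.Slider(1, 10, value=3, step=1, label="Number of Epochs")
                batch_size = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
                train_button = gr.Button("Start Training")
            with gr.Column():
                output = gr.Textbox(label="Training Status")

        train_button.click(
            fn=train_model,
            inputs=[file_input, learning_rate, num_epochs, batch_size],
            outputs=output
        )

    return demo


if __name__ == "__main__":
    create_interface().launch()
```

With this wiring, `create_interface().launch()` is enough to serve the form locally.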