MRasheq committed
Commit 117a333 · 1 Parent(s): a9281cb

Fifth commit

Files changed (1)
  1. app.py +94 -80
app.py CHANGED
@@ -26,37 +26,47 @@ def save_uploaded_file(file_obj):
     """Save uploaded file and return its path"""
     try:
         os.makedirs('uploads', exist_ok=True)
+        import tempfile

-        if hasattr(file_obj, 'name'):
-            # If it's a FileUpload object
-            file_path = os.path.join('uploads', os.path.basename(file_obj.name))
-            if isinstance(file_obj, (bytes, bytearray)):
-                with open(file_path, 'wb') as f:
-                    f.write(file_obj)
-            else:
-                file_obj.save(file_path)
+        # Create a temporary file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', dir='uploads')
+
+        # Write the content
+        if isinstance(file_obj, (bytes, bytearray)):
+            temp_file.write(file_obj)
         else:
-            # If it's raw bytes
-            import tempfile
-            fd, file_path = tempfile.mkstemp(suffix='.csv', dir='uploads')
-            with os.fdopen(fd, 'wb') as temp:
-                if isinstance(file_obj, (bytes, bytearray)):
-                    temp.write(file_obj)
-                else:
-                    temp.write(file_obj.read())
+            content = file_obj.read()
+            if isinstance(content, str):
+                temp_file.write(content.encode('utf-8'))
+            else:
+                temp_file.write(content)
+
+        temp_file.close()
+        return temp_file.name

-        return file_path
     except Exception as e:
         raise Exception(f"Error saving file: {str(e)}")

 def prepare_training_data(df):
     """Convert DataFrame into Q&A format"""
     formatted_data = []
-    for _, row in df.iterrows():
-        # Format each conversation in the required structure
-        formatted_text = f"User: {row['chunk_id']}\nAssistant: {row['text']}"
-        formatted_data.append({"text": formatted_text})
-    return formatted_data
+    try:
+        for _, row in df.iterrows():
+            # Clean and validate the data
+            chunk_id = str(row['chunk_id']).strip()
+            text = str(row['text']).strip()
+
+            if chunk_id and text:  # Only include non-empty pairs
+                # Format each conversation in the required structure
+                formatted_text = f"User: {chunk_id}\nAssistant: {text}"
+                formatted_data.append({"text": formatted_text})
+
+        if not formatted_data:
+            raise ValueError("No valid training pairs found in the data")
+
+        return formatted_data
+    except Exception as e:
+        raise Exception(f"Error preparing training data: {str(e)}")

 def prepare_training_components(
     data_path,
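For context on the two helpers rewritten above: the new `save_uploaded_file` persists the upload with `tempfile.NamedTemporaryFile(delete=False, ...)` and returns the file path, and `prepare_training_data` expects a CSV with `chunk_id` (question) and `text` (answer) columns. A minimal, self-contained sketch of that flow under the same assumptions, with the sample rows invented purely for illustration:

```python
import os
import tempfile

import pandas as pd

# Stand-in for an uploaded CSV, already as raw bytes (illustrative content only).
raw = b"chunk_id,text\nWhat does this Space do?,It fine-tunes a model on Q&A pairs.\n"

# Same pattern as the new save_uploaded_file: keep the temp file around
# (delete=False) and hand back its path for later loading.
os.makedirs("uploads", exist_ok=True)
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", dir="uploads")
tmp.write(raw)
tmp.close()

# Same cleaning and formatting as the new prepare_training_data.
df = pd.read_csv(tmp.name, encoding="utf-8")
formatted = []
for _, row in df.iterrows():
    chunk_id, text = str(row["chunk_id"]).strip(), str(row["text"]).strip()
    if chunk_id and text:  # keep only non-empty pairs
        formatted.append({"text": f"User: {chunk_id}\nAssistant: {text}"})

print(formatted[0]["text"])
```

Note that `delete=False` means saved uploads accumulate under `uploads/` until they are removed explicitly.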
@@ -66,6 +76,8 @@ def prepare_training_components(
     model_name=MODEL_NAME
 ):
     """Prepare model, tokenizer, and training arguments"""
+    print(f"Loading data from: {data_path}")  # Debug logging
+    """Prepare model, tokenizer, and training arguments"""

     # Create output directory with timestamp
     import time
@@ -75,8 +87,14 @@ def prepare_training_components(
     os.makedirs(LOGS_DIR, exist_ok=True)

     # Load data and convert to Q&A format
-    df = pd.read_csv(data_path)
-    formatted_data = prepare_training_data(df)
+    try:
+        df = pd.read_csv(data_path, encoding='utf-8')
+        print(f"Loaded CSV with {len(df)} rows")  # Debug logging
+        formatted_data = prepare_training_data(df)
+        print(f"Prepared {len(formatted_data)} training examples")  # Debug logging
+    except Exception as e:
+        print(f"Error loading CSV: {str(e)}")  # Debug logging
+        raise

     # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
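The guarded load above (explicit `encoding='utf-8'` plus debug prints) can also be run on its own as a pre-flight check of a training CSV before a run is launched. A rough standalone sketch; the `check_training_csv` helper, the required-column check, and the default `train.csv` path are illustrative additions, not part of this commit:

```python
import sys

import pandas as pd


def check_training_csv(path: str) -> None:
    """Load the CSV the same way app.py does and report basic stats."""
    try:
        df = pd.read_csv(path, encoding="utf-8")
    except Exception as e:
        sys.exit(f"Error loading CSV: {e}")

    missing = {"chunk_id", "text"} - set(df.columns)
    if missing:
        sys.exit(f"Missing required columns: {sorted(missing)}")

    print(f"Loaded CSV with {len(df)} rows")


if __name__ == "__main__":
    check_training_csv(sys.argv[1] if len(sys.argv) > 1 else "train.csv")
```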
@@ -231,63 +249,59 @@ def train_model(
 # Create Gradio interface
 def create_interface():
     # Configure Gradio to handle larger file uploads
-    demo = gr.Interface(
-        title="Model Fine-tuning Interface"
-    )
-
     gr.Config(upload_size_limit=100)
-
-    with gr.Row():
-        with gr.Column():
-            file_input = gr.File(
-                label="Upload Training Data (CSV)",
-                type="binary",
-                file_types=[".csv"]
-            )
-
-            learning_rate = gr.Slider(
-                minimum=1e-5,
-                maximum=1e-3,
-                value=2e-4,
-                label="Learning Rate"
-            )
-
-            num_epochs = gr.Slider(
-                minimum=1,
-                maximum=10,
-                value=3,
-                step=1,
-                label="Number of Epochs"
-            )
-
-            batch_size = gr.Slider(
-                minimum=1,
-                maximum=8,
-                value=4,
-                step=1,
-                label="Batch Size"
-            )
-
-            train_button = gr.Button("Start Training")
-
-        with gr.Column():
-            output = gr.Textbox(label="Training Status")
-
-    train_button.click(
-        fn=train_model,
-        inputs=[file_input, learning_rate, num_epochs, batch_size],
-        outputs=output
-    )
-
-    gr.Markdown("""
-    ## Instructions
-    1. Upload your training data in CSV format with columns:
-       - chunk_id (questions)
-       - text (answers)
-    2. Adjust training parameters if needed
-    3. Click 'Start Training'
-    4. Wait for training to complete
-    """)
+
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(
+                label="Upload Training Data (CSV)",
+                type="binary",
+                file_types=[".csv"]
+            )
+
+            learning_rate = gr.Slider(
+                minimum=1e-5,
+                maximum=1e-3,
+                value=2e-4,
+                label="Learning Rate"
+            )
+
+            num_epochs = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=3,
+                step=1,
+                label="Number of Epochs"
+            )
+
+            batch_size = gr.Slider(
+                minimum=1,
+                maximum=8,
+                value=4,
+                step=1,
+                label="Batch Size"
+            )
+
+            train_button = gr.Button("Start Training")
+
+        with gr.Column():
+            output = gr.Textbox(label="Training Status")
+
+    train_button.click(
+        fn=train_model,
+        inputs=[file_input, learning_rate, num_epochs, batch_size],
+        outputs=output
+    )
+
+    gr.Markdown("""
+    ## Instructions
+    1. Upload your training data in CSV format with columns:
+       - chunk_id (questions)
+       - text (answers)
+    2. Adjust training parameters if needed
+    3. Click 'Start Training'
+    4. Wait for training to complete
+    """)

     return demo
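For reference, the row/column layout added in the hunk above is the kind of UI that is normally assembled inside a `gr.Blocks()` context, which is also what would bind the `demo` returned at the end of `create_interface()`. A minimal, self-contained sketch of that wiring; the `gr.Blocks` wrapper and the placeholder callback are assumptions standing in for the real `train_model` in app.py:

```python
import gradio as gr


def train_model(file_obj, learning_rate, num_epochs, batch_size):
    # Placeholder for the real training entry point in app.py.
    return f"Would train for {num_epochs} epochs at lr={learning_rate}, batch size {batch_size}"


def create_interface():
    with gr.Blocks(title="Model Fine-tuning Interface") as demo:
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Training Data (CSV)",
                    type="binary",
                    file_types=[".csv"]
                )
                learning_rate = gr.Slider(1e-5, 1e-3, value=2e-4, label="Learning Rate")
                num_epochs = gr.Slider(1, 10, value=3, step=1, label="Number of Epochs")
                batch_size = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
                train_button = gr.Button("Start Training")
            with gr.Column():
                output = gr.Textbox(label="Training Status")

        train_button.click(
            fn=train_model,
            inputs=[file_input, learning_rate, num_epochs, batch_size],
            outputs=output
        )

    return demo


if __name__ == "__main__":
    create_interface().launch()
```

With this wiring, `create_interface().launch()` is enough to serve the form locally.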