Merge pull request #11 from jenbenarye/main
training (lora) & dataset processing scripts
- .gitignore +13 -1
- app/app.py +3 -0
- ml/adapter_metadata.py +41 -0
- ml/dataset_training.ipynb +0 -398
- ml/kto.py +0 -117
- ml/kto_dataset_processor.py +196 -51
- ml/{kto_pipeline.py → trainer.py} +116 -53
.gitignore
CHANGED
@@ -160,4 +160,16 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-user_feedback
+user_feedback
+
+# Hugging Face cache
+wandb/
+.cache/
+cached_*
+
+# Hugging Face datasets
+datasets/
+
+# Hugging Face models
+models/
app/app.py
CHANGED
@@ -386,6 +386,9 @@ css = """
 .option.svelte-pcaovb {
     display: none !important;
 }
+.retry-btn {
+    display: none !important;
+}
 """
 
 with gr.Blocks(css=css) as demo:
ml/adapter_metadata.py
ADDED
@@ -0,0 +1,41 @@
+from dataclasses import dataclass, field
+from typing import Dict
+import json
+
+
+@dataclass
+class AdapterMetadata:
+    """Metadata for tracking adapter training history."""
+    training_timestamp: str  # ISO-format timestamp
+    model_name: str          # Base model name
+    language: str            # Language of the adapter
+    # Defaulted so callers (e.g. trainer.py) can construct metadata without them.
+    training_params: Dict = field(default_factory=dict)  # Training parameters used
+    version: str = "1.0"     # Version of the adapter
+
+    @classmethod
+    def from_dict(cls, data: Dict):
+        """Create a class instance from a dictionary."""
+        return cls(**data)
+
+    def to_dict(self) -> Dict:
+        """Convert the class instance to a dictionary."""
+        return {
+            "training_timestamp": self.training_timestamp,
+            "training_params": self.training_params,
+            "model_name": self.model_name,
+            "language": self.language,
+            "version": self.version,
+        }
+
+    def save(self, filepath: str):
+        """Save metadata to a JSON file."""
+        with open(filepath, "w") as f:
+            json.dump(self.to_dict(), f, indent=2)
+
+    @classmethod
+    def load(cls, filepath: str):
+        """Load metadata from a JSON file."""
+        with open(filepath, "r") as f:
+            data = json.load(f)
+        return cls.from_dict(data)
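
For reference, a minimal usage sketch of the metadata round-trip (the field values are illustrative; the metadata.json file name matches what trainer.py writes below):

    from adapter_metadata import AdapterMetadata

    meta = AdapterMetadata(
        training_timestamp="2025-01-01_12-00-00",
        model_name="CohereForAI/aya-expanse-8b",
        language="English",
        training_params={"learning_rate": 5e-7, "num_train_epochs": 1},
    )
    meta.save("metadata.json")                        # writes indented JSON
    restored = AdapterMetadata.load("metadata.json")  # reads it back
    assert restored == meta                           # dataclass __eq__ compares all fields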
ml/dataset_training.ipynb
DELETED
@@ -1,398 +0,0 @@
The deleted file was an exploratory Jupyter notebook (Python 3.10.13 kernel). Its code cells, in order, were:

-#dependencies:
-import pandas as pd
-
-import torch
-from transformers import GPT2Tokenizer
-
-from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

-#loading datasets:
-from datasets import load_dataset
-
-ds = load_dataset("stanfordnlp/SHP", split='train')

-df = ds.to_pandas()
-print(df.columns)

-# df['response_length'] = df['history'].apply(len)
-# df['label'] = df['response_length'].apply(lambda x: 'long' if x > 100 else 'short')
-df.drop(columns=['post_id', 'domain', 'c_root_id_A', 'c_root_id_B', 'created_at_utc_A', 'created_at_utc_B', 'seconds_difference'])

-model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
-ref_model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-tokenizer.pad_token = tokenizer.eos_token

-from trl_rlhf_data import runner, ScriptArguments
-import re
-from dataclasses import dataclass
-from typing import Dict, List, Optional
-
-from datasets import load_dataset
-from transformers import HfArgumentParser

-dataset = runner(ScriptArguments)

Stored cell outputs, removed with the file, included dataset-loading progress widgets; the SHP column index (post_id, domain, upvote_ratio, history, c_root_id_A, c_root_id_B, created_at_utc_A, created_at_utc_B, score_A, score_B, human_ref_A, human_ref_B, labels, seconds_difference, score_ratio); a 348718-row × 8-column DataFrame preview; a transformers FutureWarning about `clean_up_tokenization_spaces`; and a failing last cell: `TypeError: runner() takes 0 positional arguments but 1 was given`.
ml/kto.py
DELETED
@@ -1,117 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Run the KTO training script with the commands below. In general, the optimal configuration for KTO will be similar to that of DPO.
-
-# Full training:
-python examples/scripts/kto.py \
-    --dataset_name trl-lib/kto-mix-14k \
-    --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
-    --per_device_train_batch_size 16 \
-    --num_train_epochs 1 \
-    --learning_rate 5e-7 \
-    --lr_scheduler_type=cosine \
-    --gradient_accumulation_steps 1 \
-    --logging_steps 10 \
-    --eval_steps 500 \
-    --output_dir=kto-aligned-model \
-    --warmup_ratio 0.1 \
-    --report_to wandb \
-    --bf16 \
-    --logging_first_step
-
-# QLoRA:
-python examples/scripts/kto.py \
-    --dataset_name trl-lib/kto-mix-14k \
-    --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
-    --per_device_train_batch_size 8 \
-    --num_train_epochs 1 \
-    --learning_rate 5e-7 \
-    --lr_scheduler_type=cosine \
-    --gradient_accumulation_steps 1 \
-    --logging_steps 10 \
-    --eval_steps 500 \
-    --output_dir=kto-aligned-model-lora \
-    --warmup_ratio 0.1 \
-    --report_to wandb \
-    --bf16 \
-    --logging_first_step \
-    --use_peft \
-    --load_in_4bit \
-    --lora_target_modules=all-linear \
-    --lora_r=16 \
-    --lora_alpha=16
-"""
-
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
-
-from trl import (
-    KTOConfig,
-    KTOTrainer,
-    ModelConfig,
-    ScriptArguments,
-    get_peft_config,
-    setup_chat_format,
-)
-
-
-if __name__ == "__main__":
-    parser = HfArgumentParser((ScriptArguments, KTOConfig, ModelConfig))
-    script_args, training_args, model_args = parser.parse_args_into_dataclasses()
-
-    # Load a pretrained model
-    model = AutoModelForCausalLM.from_pretrained(
-        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
-    )
-    ref_model = AutoModelForCausalLM.from_pretrained(
-        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
-    )
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    # If we are aligning a base model, we use ChatML as the default template
-    if tokenizer.chat_template is None:
-        model, tokenizer = setup_chat_format(model, tokenizer)
-
-    # Load the dataset
-    dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
-
-    # Initialize the KTO trainer
-    trainer = KTOTrainer(
-        model,
-        ref_model,
-        args=training_args,
-        train_dataset=dataset[script_args.dataset_train_split],
-        eval_dataset=(
-            dataset[script_args.dataset_test_split]
-            if training_args.eval_strategy != "no"
-            else None
-        ),
-        processing_class=tokenizer,
-        peft_config=get_peft_config(model_args),
-    )
-
-    # Train and push the model to the Hub
-    trainer.train()
-
-    # Save and push to hub
-    trainer.save_model(training_args.output_dir)
-    if training_args.push_to_hub:
-        trainer.push_to_hub(dataset_name=script_args.dataset_name)
ml/kto_dataset_processor.py
CHANGED
@@ -1,65 +1,210 @@
Only fragments of the removed version survive in the source. Recoverably, it defined a transform_data() helper emitting prompt/completion/label rows, iterated over train_prefs/test_prefs splits, and built train/test DataFrames; its recoverable tail was:

-        data_points.append({
-            "prompt": example["prompt"],
-            "completion": rejected_completion.strip(),
-            "label": False
-        })
-    return data_points
-
-    # Process train and test splits
-    train_data = []
-    test_data = []
-
-    for example in train_prefs:
-        train_data.extend(transform_data(example))
-
-    for example in test_prefs:
-        test_data.extend(transform_data(example))
-
-    # Convert unified data to DataFrames
-    train_df = pd.DataFrame(train_data)
-    test_df = pd.DataFrame(test_data)

The rewritten file, in full (lines kept from the old version are unmarked):

+from datasets import Dataset, load_dataset
 import pandas as pd
+from sklearn.model_selection import train_test_split
+import json
+from ipdb import set_trace as st
+from transformers import AutoTokenizer
+from enum import Enum
 
+class SupportedLanguages(str, Enum):
+    """Enumeration of supported languages"""
+    ENGLISH = "English"
+    DUTCH = "Dutch"
+    ITALIAN = "Italian"
+    SPANISH = "Spanish"
+    FRENCH = "French"
+    GERMAN = "German"
+    PORTUGUESE = "Portuguese"
+    RUSSIAN = "Russian"
+    CHINESE = "Chinese"
+    JAPANESE = "Japanese"
+    KOREAN = "Korean"
 
+def transform_conversation(
+    entry: dict,
+    model_name: str,
+    max_history_turns: int = 10,
+    max_history_tokens: int = 4000
+) -> list:
+    """Transform conversation into KTO format with history"""
+    data_points = []
+    conversation = entry["conversation"]
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    for i, message in enumerate(conversation):
+        # Only create data points for assistant messages that have ratings
+        if message["role"] != "assistant" or message["rating"] not in [1, -1]:
+            continue
+
+        # Get previous messages up to limits
+        formatted_history = []
+        formatted_prompt = ""
+        tokens = 0
+        pairs = 0  # Count complete user/assistant pairs
+
+        # Start from the current message and work backwards
+        current_idx = i - 1
+        while current_idx >= 0 and pairs < max_history_turns:
+            # We need both user and assistant messages to form a pair
+            if current_idx > 0 and conversation[current_idx]["role"] == "assistant" and conversation[current_idx-1]["role"] == "user":
+                # Add the pair to history
+                formatted_history.insert(0, conversation[current_idx-1])  # user
+                formatted_history.insert(1, conversation[current_idx])    # assistant
+
+                # Check token limit
+                try:
+                    current_formatted = tokenizer.apply_chat_template(formatted_history, tokenize=False)
+                    current_tokens = len(tokenizer.encode(current_formatted))
+
+                    if current_tokens > max_history_tokens:
+                        formatted_history = formatted_history[2:]  # Remove the oldest pair
+                        break
+
+                    formatted_prompt = current_formatted
+                    tokens = current_tokens
+                    pairs += 1
+                    current_idx -= 2
+                except Exception:
+                    # If template application fails, remove the last added pair
+                    formatted_history = formatted_history[2:]
+                    break
+            else:
+                current_idx -= 1
+
+        # Add the final user message that prompted the rated response
+        if i > 0 and conversation[i-1]["role"] == "user":
+            last_history = formatted_history + [conversation[i-1]]
+            try:
+                formatted_prompt = tokenizer.apply_chat_template(last_history, tokenize=False)
+            except Exception:
+                # If template application fails, use the previous valid prompt
+                pass
+
+        data_points.append({
+            "prompt": formatted_prompt.strip(),
+            "completion": message["content"].strip(),
+            "label": message["rating"] == 1,
+            "timestamp": entry["timestamp"],
+            "session_id": entry["session_id"],
+            "conversation_id": entry["conversation_id"],
+            "language": entry["language"]
+        })
+
+    return data_points
+
+def process_feel_dataset(
+    language: str,
+    model_name: str = "CohereForAI/aya-expanse-8b",
+    max_history_turns: int = 10,
+    max_history_tokens: int = 4000
+):
     """
+    Processes the feel dataset into a format suitable for KTO training using TRL.
+
+    Args:
+        language: Language to filter the dataset for (must be one of SupportedLanguages)
+        model_name: Name of the model to format for
+        max_history_turns: Maximum number of previous turns to include in history
+        max_history_tokens: Maximum number of tokens allowed in history
 
     Returns:
+        dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format
+
+    Raises:
+        ValueError: If language is not provided or not in SupportedLanguages
     """
+    # Validate language
+    if not language:
+        raise ValueError("Language parameter is required")
+
+    try:
+        # Validate that it's a supported language
+        SupportedLanguages(language)
+    except ValueError:
+        supported_langs = "\n- ".join([lang.value for lang in SupportedLanguages])
+        raise ValueError(
+            f"Invalid language: '{language}'\n"
+            f"Supported languages are:\n- {supported_langs}"
+        )
+
+    # Load feel dataset from HuggingFace
+    feel_dataset = load_dataset("feel-fl/feel-feedback")["train"]
+
+    # Filter dataset by language
+    feel_dataset = feel_dataset.filter(lambda x: x["language"] == language)
 
+    if len(feel_dataset) == 0:
+        raise ValueError(f"No data found for language: {language}")
+
+    kto_data = []
+
+    # Process all conversations in the filtered dataset
+    for entry in feel_dataset:
+        kto_data.extend(transform_conversation(
+            entry,
+            model_name,
+            max_history_turns,
+            max_history_tokens
+        ))
+
+    if len(kto_data) == 0:
+        raise ValueError(f"No valid training examples found for language: {language}")
+
+    # Convert to DataFrame
+    kto_df = pd.DataFrame(kto_data)
+
+    # Split into train and test sets (70% train, 30% test)
+    train_df, test_df = train_test_split(kto_df, test_size=0.3, random_state=42)
+
+    # Reset index to remove '__index_level_0__'
+    train_df = train_df.reset_index(drop=True)
+    test_df = test_df.reset_index(drop=True)
 
     # Convert to Hugging Face Dataset
+    train_dataset = Dataset.from_pandas(train_df)
+    test_dataset = Dataset.from_pandas(test_df)
 
+    print(f"Processed {len(kto_data)} examples for language: {language}")
+    print(f"Train set size: {len(train_dataset)}")
+    print(f"Test set size: {len(test_dataset)}")
 
+    return {"train": train_dataset, "test": test_dataset}
 
 if __name__ == "__main__":
+    # Process the dataset
+    datasets = process_feel_dataset("English")
+
+    # Print distribution of positive/negative labels
+    train_labels = datasets['train']['label']
+    test_labels = datasets['test']['label']
+
+    print("\nLabel Distribution:")
+    print("Train set:")
+    print(f"Positive feedback: {sum(train_labels)}")
+    print(f"Negative feedback: {len(train_labels) - sum(train_labels)}")
+    print(f"Positive ratio: {sum(train_labels)/len(train_labels):.2%}")
+
+    print("\nTest set:")
+    print(f"Positive feedback: {sum(test_labels)}")
+    print(f"Negative feedback: {len(test_labels) - sum(test_labels)}")
+    print(f"Positive ratio: {sum(test_labels)/len(test_labels):.2%}")
+
+    # Load original FEEL dataset
+    feel_dataset = load_dataset("feel-fl/feel-feedback", split="train")
+
+    # Print one original conversation
+    print("\nOriginal conversation from FEEL dataset:")
+    print(json.dumps(feel_dataset[0], indent=2))
+
+    # Print sample entries from processed dataset
+    print("\nSample entries from processed KTO dataset:")
+    print("\n" + "="*80 + "\nTRAIN SET SAMPLES\n" + "="*80)
+
+    # Export datasets to CSV
+    train_df = datasets['train'].to_pandas()
+    test_df = datasets['test'].to_pandas()
+
+    train_df.to_csv('kto_train_dataset.csv', index=False)
+    test_df.to_csv('kto_test_dataset.csv', index=False)
+
+    print("\nDatasets exported to 'kto_train_dataset.csv' and 'kto_test_dataset.csv'")
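
To make the KTO row format concrete, here is a minimal sketch of what transform_conversation() yields for a toy entry shaped like the records the code reads (the entry itself is invented for illustration, and the call downloads the model's tokenizer):

    from kto_dataset_processor import transform_conversation

    entry = {
        "conversation": [
            {"role": "user", "content": "Hi!"},
            {"role": "assistant", "content": "Hello! How can I help?", "rating": 1},
        ],
        "timestamp": "2025-01-01T12:00:00",
        "session_id": "s1",
        "conversation_id": "c1",
        "language": "English",
    }

    rows = transform_conversation(entry, model_name="CohereForAI/aya-expanse-8b")
    # One row: the user turn rendered through the chat template as "prompt",
    # the rated assistant reply as "completion", and label=True (rating == 1).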
ml/{kto_pipeline.py → trainer.py}
RENAMED
@@ -1,35 +1,58 @@
+import os
 import torch
 from dataclasses import dataclass
 from accelerate import PartialState
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
 from trl import KTOConfig, KTOTrainer, ModelConfig, get_peft_config, maybe_unpair_preference_dataset, setup_chat_format
-from kto_dataset_processor import …
+from kto_dataset_processor import process_feel_dataset, SupportedLanguages
+from adapter_metadata import AdapterMetadata  # needed for the metadata save in main()
 from datetime import datetime
 import wandb
+from enum import Enum
+from typing import Optional
+from pathlib import Path
+
+
+# PEFT library: attach and load adapters
+from peft import get_peft_model, PeftModel
 
 ####################################
 # CONFIGURATION
 ####################################
 
+
 @dataclass
 class ScriptArguments:
     """
     Configuration for the script.
     """
-    process_dataset_func: callable = …
-    checkpoint_path: str = None
-    push_to_hub: bool = …
+    process_dataset_func: callable = process_feel_dataset
+    checkpoint_path: str = None
+    push_to_hub: bool = True
+    language: str = "English"  # Default to English
+
+    def __post_init__(self):
+        """Validate the language after initialization"""
+        try:
+            # This will raise ValueError if language is not in the enum
+            SupportedLanguages(self.language)
+        except ValueError:
+            supported_langs = "\n- ".join([lang.value for lang in SupportedLanguages])
+            raise ValueError(
+                f"Invalid language: '{self.language}'\n"
+                f"Supported languages are:\n- {supported_langs}"
+            )
 
 @dataclass
 class ModelArguments(ModelConfig):
     """
     Configuration for the model.
     """
-    model_name: str = "…"
+    model_name: str = "CohereForAI/aya-expanse-8b"
     use_peft: bool = True
     lora_target_modules: str = "all-linear"
     lora_r: int = 16
     lora_alpha: int = 16
+    trust_remote_code: bool = True
 
 @dataclass
 class TrainingArguments(KTOConfig):
@@ -38,7 +61,7 @@ class TrainingArguments(KTOConfig):
     """
     output_dir: str = f"kto_{ModelArguments.model_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
     num_train_epochs: int = 1
-    per_device_train_batch_size: int = 4
+    per_device_train_batch_size: int = 4
     learning_rate: float = 5e-7
     lr_scheduler_type: str = "cosine"
     gradient_accumulation_steps: int = 1
@@ -48,8 +71,6 @@ class TrainingArguments(KTOConfig):
     bf16: bool = True
     logging_first_step: bool = True
 
-
-
 # Initialize configurations
 script_args = ScriptArguments()
 training_args = TrainingArguments()
@@ -61,7 +82,7 @@ model_args = ModelArguments()
 
 def load_model_and_tokenizer(model_args):
     """
-    Load …
+    Load the base model and tokenizer from the Hugging Face Hub.
     """
     model = AutoModelForCausalLM.from_pretrained(
         model_args.model_name,
@@ -74,74 +95,97 @@ def load_model_and_tokenizer(model_args):
         trust_remote_code=model_args.trust_remote_code
     )
 
-    # Set pad token if missing
+    # Set pad token if it is missing
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
 
-    # Setup chat format if not …
-    if tokenizer…
+    # Setup chat format if not available on the tokenizer
+    if not getattr(tokenizer, "chat_template", None):
         model, tokenizer = setup_chat_format(model, tokenizer)
 
-    # (removed: a commented-out block for extending the tokenizer with unknown
-    # tokens; only these fragments survive in the source)
-    # tokens = tokenizer.tokenize(text)
-    # all_tokens.update(tokens)
-    # vocab = set(tokenizer.get_vocab().keys())
-    # unknown_tokens = all_tokens - vocab
-    # return unknown_tokens
-    # print(f"Found {len(unknown_tokens)} unknown tokens: {list(unknown_tokens)[:10]}...")
-    # model.resize_token_embeddings(len(tokenizer))
-    # print(f"Tokenizer vocabulary size after extension: {len(tokenizer)}")
+    return model, tokenizer
+
+def get_adapter_path(model_name: str, language: str, timestamp: str = None) -> Path:
+    """
+    Generate standardized adapter path.
+    If timestamp is None, returns the base language directory.
+    Otherwise, returns specific adapter version path.
+
+    Format: adapters/{model_name}/{language}/version_{timestamp}
+    """
+    # Clean model name (remove slashes, etc.)
+    clean_model_name = model_name.replace('/', '_')
+
+    base_path = Path("adapters") / clean_model_name / language
+    if timestamp:
+        return base_path / f"version_{timestamp}"
+    return base_path
+
+def load_latest_adapter(model, model_name: str, language: str) -> tuple[PeftModel, str]:
+    """
+    Load the most recent adapter for given model and language.
+    Returns: (loaded_model, timestamp of loaded adapter)
+    """
+    adapter_base = get_adapter_path(model_name, language)
+
+    if not adapter_base.exists():
+        return None, None
+
+    # Get all version directories and sort by timestamp
+    versions = sorted(
+        [d for d in adapter_base.glob("version_*")],
+        key=lambda x: x.name,
+        reverse=True
+    )
+
+    if not versions:
+        return None, None
+
+    latest_version = versions[0]
+    timestamp = latest_version.name.replace("version_", "")
+
+    model = PeftModel.from_pretrained(model, latest_version, is_trainable=True)
+    return model, timestamp
 
 ####################################
 # MAIN LOGIC
 ####################################
 
 def main():
-    # Initialize wandb
+    # Initialize wandb for logging
     wandb.init(project="kto")
 
-    # …
+    # Get timestamp at start of training
+    training_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+
+    print("Loading base model and tokenizer...")
     model, tokenizer = load_model_and_tokenizer(model_args)
     ref_model, _ = load_model_and_tokenizer(model_args)
     print("Models and tokenizer loaded.")
 
-    # Load …
+    # Load existing adapter or create new one
+    loaded_model, previous_timestamp = load_latest_adapter(
+        model,
+        model_args.model_name,
+        script_args.language
+    )
+
+    if loaded_model is not None:
+        model = loaded_model
+        peft_config = None  # adapter already attached; the trainer must not re-wrap it
+        print(f"Loaded existing adapter trained at {previous_timestamp}")
+    else:
+        # Initialize new LoRA adapter
+        peft_config = get_peft_config(model_args)
+        model = get_peft_model(model, peft_config)
+        print("Initialized new adapter")
+
+    # -----------------------------
+    # Data Preparation and Training
+    # -----------------------------
     print("Processing dataset...")
-    dataset = …
+    dataset = script_args.process_dataset_func(script_args.language)
    print("Dataset processed.")
 
-    # # Extend tokenizer with missing tokens
-    # print("Adding unknown tokens to tokenizer...")
-    # add_tokens_to_tokenizer(tokenizer, model, dataset)
-    # print("Tokenizer updated.")
-
-    # Initialize trainer
     print("Initializing trainer...")
     trainer = KTOTrainer(
         model=model,
@@ -149,8 +193,8 @@ def main():
         args=training_args,
         train_dataset=dataset["train"],
         eval_dataset=dataset["test"],
-        …
-        peft_config=…
+        processing_class=tokenizer,
+        peft_config=peft_config,
     )
 
     # Training
@@ -182,10 +226,29 @@ def main():
             "step": metrics.get("step")
         })
 
-    # Save …
+    # Save the adapter
+    adapter_path = get_adapter_path(
+        model_args.model_name,
+        script_args.language,
+        training_timestamp
+    )
+    adapter_path.parent.mkdir(parents=True, exist_ok=True)
+
+    print(f"Saving adapter to: {adapter_path}")
+    model.save_pretrained(adapter_path)
+
+    # Save metadata
+    metadata = AdapterMetadata(
+        training_timestamp=training_timestamp,
+        model_name=model_args.model_name,
+        language=script_args.language,
+    )
+    metadata.save(adapter_path / "metadata.json")
+
     if script_args.push_to_hub:
-        …
+        repo_id = f"feel-fl/adapters/{model_args.model_name.replace('/', '_')}/{script_args.language}"
+        print(f"Pushing adapter to Hugging Face Hub at {repo_id}...")
+        model.push_to_hub(repo_id=repo_id)
 
     print("Process completed.")
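
After a run, the saved LoRA weights can be reattached to the base model for inference. A minimal sketch, with an illustrative adapter path following the adapters/{model_name}/{language}/version_{timestamp} layout above:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(
        "CohereForAI/aya-expanse-8b", torch_dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained("CohereForAI/aya-expanse-8b")

    # is_trainable defaults to False, so the adapter loads in inference mode
    model = PeftModel.from_pretrained(
        base, "adapters/CohereForAI_aya-expanse-8b/English/version_2025-01-01_12-00-00"
    )
    model.eval()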