marcoorasch committed
Commit 7c07f27 · verified · 1 Parent(s): 8437c36

Upload 3 files

Hatespeech_Offensive_Classification_llama3.2-3B-instruct.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Hatespeech_Offensive_Classification_testmodels.ipynb ADDED
@@ -0,0 +1,280 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "93f5db97-0d94-4464-9891-0ebfe519d534",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install -U bitsandbytes\n",
+ "#!pip install -U transformers\n",
+ "#!pip install -U accelerate\n",
+ "#!pip install -U peft\n",
+ "#!pip install -U trl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1fd5f7f5-c053-4ecd-a0d4-b7a12ee32136",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!huggingface-cli whoami"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5780dde2-c61e-464b-91aa-e68301124b6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "from tqdm import tqdm\n",
+ "import bitsandbytes as bnb\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import transformers\n",
+ "from datasets import Dataset\n",
+ "from peft import LoraConfig, PeftConfig\n",
+ "from trl import SFTTrainer\n",
+ "from trl import setup_chat_format\n",
+ "from transformers import (AutoModelForCausalLM, \n",
+ " AutoTokenizer, \n",
+ " BitsAndBytesConfig, \n",
+ " TrainingArguments, \n",
+ " pipeline, \n",
+ " logging)\n",
+ "from sklearn.metrics import (accuracy_score, \n",
+ " classification_report, \n",
+ " confusion_matrix)\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "84b29425-b5ad-4852-b9e2-6887eece0de8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "df = pd.read_parquet(\"hf://datasets/tdavidson/hate_speech_offensive/data/train-00000-of-00001.parquet\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b7395daa-b933-4204-854c-472548343f31",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.rename(columns={\"class\": \"label\",\"tweet\": \"text\"}).sample(frac=1, random_state=85).reset_index(drop=True).head(3000)\n",
+ "df.loc[:,'label'] = df.loc[:,'label'].replace(0,'Hate')\n",
+ "df.loc[:,'label'] = df.loc[:,'label'].replace(1,'Offensive')\n",
+ "df.loc[:,'label'] = df.loc[:,'label'].replace(2,'Normal')\n",
+ "# Split the DataFrame\n",
+ "train_size = 0.8\n",
+ "eval_size = 0.1\n",
+ "\n",
+ "# Calculate sizes\n",
+ "train_end = int(train_size * len(df))\n",
+ "eval_end = train_end + int(eval_size * len(df))\n",
+ "\n",
+ "# Split the data\n",
+ "X_train = df[:train_end]\n",
+ "X_eval = df[train_end:eval_end]\n",
+ "X_test = df[eval_end:]\n",
+ "# Define the prompt generation functions\n",
+ "def generate_prompt(data_point):\n",
+ " return f\"\"\"\n",
+ " Classify the text into Hatespeech, Offensive, Normal and return the answer as the corresponding label.\n",
+ "text: {data_point[\"text\"]}\n",
+ "label: {data_point[\"label\"]}\"\"\".strip()\n",
+ "\n",
+ "def generate_test_prompt(data_point):\n",
+ " return f\"\"\"\n",
+ " Classify the text into Hatespeech, Offensive, Normal and return the answer as the corresponding label.\n",
+ " text: {data_point[\"text\"]}\n",
+ " label: \"\"\".strip()\n",
+ "\n",
+ "# Generate prompts for training and evaluation data\n",
+ "X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)\n",
+ "X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)\n",
+ "\n",
+ "# Generate test prompts and extract true labels\n",
+ "y_true = X_test.loc[:,'label']\n",
+ "X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=[\"text\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bc18edca-e02b-4a32-8cc1-7d83f00bdba5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train.label.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "52d5ccf5-7669-447f-8a90-43cbb7e8e337",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data = Dataset.from_pandas(X_train[[\"text\"]])\n",
+ "eval_data = Dataset.from_pandas(X_eval[[\"text\"]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7732c58-e8c6-436b-810d-40abd4f593ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data['text'][2000]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f15e59f-9e50-48f1-b6f5-d6dc46db623f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#CHANGE MODEL HERE#\n",
+ "base_model_name = \"meta-llama/Llama-3.2-3B-Instruct\"\n",
+ "\n",
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=True,\n",
+ " bnb_4bit_use_double_quant=False,\n",
+ " bnb_4bit_quant_type=\"nf4\",\n",
+ " bnb_4bit_compute_dtype=\"float16\",\n",
+ ")\n",
+ "\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " base_model_name,\n",
+ " device_map=\"auto\",\n",
+ " torch_dtype=\"float16\",\n",
+ " quantization_config=bnb_config, \n",
+ ")\n",
+ "\n",
+ "model.config.use_cache = False\n",
+ "model.config.pretraining_tp = 1\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
+ "\n",
+ "tokenizer.pad_token_id = tokenizer.eos_token_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97ccf698-09de-4423-9287-8dedf779fc3d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict(test, model, tokenizer):\n",
+ " y_pred = []\n",
+ " labels = [\"Hate\", \"Offensive\", \"Normal\"]\n",
+ " \n",
+ " for i in tqdm(range(len(test))):\n",
+ " prompt = test.iloc[i][\"text\"]\n",
+ " pipe = pipeline(task=\"text-generation\", \n",
+ " model=model, \n",
+ " tokenizer=tokenizer, \n",
+ " max_new_tokens=2, \n",
+ " temperature=0.1)\n",
+ " \n",
+ " result = pipe(prompt)\n",
+ " answer = result[0]['generated_text'].split(\"label:\")[-1].strip()\n",
+ " \n",
+ " # Determine the predicted category\n",
+ " for label in labels:\n",
+ " if label.lower() in answer.lower():\n",
+ " y_pred.append(label)\n",
+ " break\n",
+ " else:\n",
+ " y_pred.append(\"none\")\n",
+ " \n",
+ " return y_pred\n",
+ "\n",
+ "y_pred = predict(X_test, model, tokenizer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2bc4f2ea-5cde-4368-8f92-7883995d8977",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def evaluate(y_true, y_pred):\n",
+ " labels = [\"Hate\", \"Offensive\", \"Normal\"]\n",
+ " mapping = {label: idx for idx, label in enumerate(labels)}\n",
+ " \n",
+ " def map_func(x):\n",
+ " return mapping.get(x, -1) # Map to -1 if not found, but should not occur with correct data\n",
+ " \n",
+ " y_true_mapped = np.vectorize(map_func)(y_true)\n",
+ " y_pred_mapped = np.vectorize(map_func)(y_pred)\n",
+ " \n",
+ " # Calculate accuracy\n",
+ " accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)\n",
+ " print(f'Accuracy: {accuracy:.3f}')\n",
+ " \n",
+ " # Generate accuracy report\n",
+ " unique_labels = set(y_true_mapped) # Get unique labels\n",
+ " \n",
+ " for label in unique_labels:\n",
+ " label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]\n",
+ " label_y_true = [y_true_mapped[i] for i in label_indices]\n",
+ " label_y_pred = [y_pred_mapped[i] for i in label_indices]\n",
+ " label_accuracy = accuracy_score(label_y_true, label_y_pred)\n",
+ " print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')\n",
+ " \n",
+ " # Generate classification report\n",
+ " class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))\n",
+ " print('\\nClassification Report:')\n",
+ " print(class_report)\n",
+ " \n",
+ " # Generate confusion matrix\n",
+ " conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))\n",
+ " print('\\nConfusion Matrix:')\n",
+ " print(conf_matrix)\n",
+ "\n",
+ "evaluate(y_true, y_pred)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
run.ipynb ADDED
@@ -0,0 +1,127 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "b368a208-7b0f-4928-aad6-94030a47d573",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6d72bc7458d64ec7af180321e7d9d7aa",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "###load models\n",
+ "base_model = \"meta-llama/Llama-3.2-3B-Instruct\"\n",
+ "fine_tuned_model = \"/home/marco/llama-3.2-instruct-offensive-classification-1.0.0\"\n",
+ "\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
+ "from peft import PeftModel\n",
+ "import torch\n",
+ "\n",
+ "\n",
+ "# Reload tokenizer and model\n",
+ "tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model)\n",
+ "\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " fine_tuned_model,\n",
+ " return_dict=True,\n",
+ " low_cpu_mem_usage=True,\n",
+ " torch_dtype=torch.float16,\n",
+ " device_map=\"auto\",\n",
+ " trust_remote_code=True,\n",
+ " offload_buffers=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "54e39123-1ed6-4990-8295-6df1e0563fc5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = \"You are a pig!\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "1b68121f-3215-46f6-901b-406be4e05a06",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Device set to use cpu\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Offensive\n"
+ ]
+ }
+ ],
+ "source": [
+ "###Start Prompt\n",
+ "prompt = f\"\"\"Classify the text into Hatespeech, Offensive, Normal and return the answer as the corresponding label.\n",
+ "text: {text}\n",
+ "label: \"\"\".strip()\n",
+ "\n",
+ "pipe = pipeline(\n",
+ " \"text-generation\",\n",
+ " model=model,\n",
+ " tokenizer=tokenizer,\n",
+ " torch_dtype=torch.float16,\n",
+ " device_map=\"auto\"\n",
+ ")\n",
+ "\n",
+ "outputs = pipe(prompt, max_new_tokens=2, do_sample=True, temperature=0.1, pad_token_id=tokenizer.eos_token_id)\n",
+ "print(outputs[0][\"generated_text\"].split(\"label: \")[-1].strip())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d709317d-b9cf-4590-9caf-ac74842f6be2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }