Spaces:

wangzerui
/

Job-Skills-Analyzer

Build error

File size: 7,659 Bytes

6eb2110

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-25T14:59:03.066893917Z",
     "start_time": "2023-11-25T14:59:02.924638197Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/z/miniconda3/envs/llama/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:472: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6745f1964cda44068721c6c8b5f91eee",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/z/miniconda3/envs/llama/lib/python3.10/site-packages/transformers/utils/hub.py:374: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
    "\n",
    "# Define the base model ID\n",
    "base_model_id = \"meta-llama/Llama-2-13b-hf\"\n",
    "\n",
    "# Create a BitsAndBytesConfig object with the corrected settings\n",
    "quantization_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_use_double_quant=True,\n",
    "    bnb_4bit_quant_type=\"nf4\",\n",
    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
    "    load_in_8bit_fp32_cpu_offload=True  # Set as suggested in the error\n",
    ")\n",
    "\n",
    "# Load the base model with the updated quantization configuration\n",
    "# Adjust 'device_map' based on your system's GPU configuration\n",
    "base_model = AutoModelForCausalLM.from_pretrained(\n",
    "    base_model_id,  \n",
    "    quantization_config=quantization_config,\n",
    "    trust_remote_code=True,\n",
    "    use_auth_token=True\n",
    ")\n",
    "\n",
    "# Load the tokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_BxOhAiqyRgp"
   },
   "source": [
    "Now load the QLoRA adapter from the appropriate checkpoint directory, i.e. the best performing model checkpoint:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-25T14:59:12.830783738Z",
     "start_time": "2023-11-25T14:59:12.826615170Z"
    },
    "id": "GwsiqhWuyRgp"
   },
   "outputs": [],
   "source": [
    "from peft import PeftModel\n",
    "\n",
    "ft_model = PeftModel.from_pretrained(base_model, \"checkpoint-2800\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "  \n",
    "eval_dataset = load_dataset('json', data_files='/home/z/Music/LLAMA/llama/IPG/datasets/new_test_data.json', split='train')\n",
    "\n",
    "\n",
    "def formatting_func(example):\n",
    "    text = f\"### The job description: {example['text']}\\n ### The skills: \"\n",
    "    return text\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "def run_finetune_model(model_id):\n",
    "\n",
    "    example = eval_dataset.filter(lambda x: x['id'] == model_id)[0]\n",
    "    formatted_text = formatting_func(example)\n",
    "    \n",
    "    #print(formatted_text)\n",
    "    model_input = tokenizer(formatted_text, return_tensors=\"pt\").to(\"cuda\")\n",
    "\n",
    "\n",
    "    ft_model.eval()\n",
    "    with torch.no_grad():\n",
    "        output_tokens = ft_model.generate(**model_input, max_new_tokens=200)[0]\n",
    "        generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)\n",
    "    \n",
    "    print(generated_text)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "### The job description: German BD Manager\n",
      "Job Description：\n",
      "1、Represent the company to develop new partners for energy storage system；\n",
      "2、Maintain good relationship and help partners to develop/grow the business；\n",
      "3、Formulate a strategy and target for the market exploration so as to achieve good performance；\n",
      "4、Pay attention and collect information for the latest development/tendency in the industry as well as getting feedback/insight to R&D；\n",
      "5、Advice and assist the company to build a strong local team including but not limited to after sale service, technical support, sales and marketing.\n",
      " \n",
      "Job Requirements：\n",
      "1、Fluent in English and German；\n",
      "2、5+ years of experience in the industry of Energy Storage System, a good education background will be preferential；\n",
      "3、Strong execution and result-oriented, attach importance to details and critical thinking as well as desire to progress/evolve；\n",
      "4、Open-minded and teamwork, great skills in communication.\n",
      " ### The skills:  ['programming', 'simulation', 'communication', 'excel', 'word', 'powerpoint', 'marketing', 'c++', 'matlab', 'html', 'data analysis', 'powerpoint', 'communication', 'project management', 'excel', 'microsoft office', 'tableau', 'powerpoint', 'word', 'microsoft office', 'communication', 'python', 'excel', 'microsoft office', 'c++', 'python', 'data analysis', 'python', 'html', 'data analysis', 'communication', 'microsoft office', 'java', 'powerpoint']\n",
      " ### The qualifications: \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "run_finetune_model(\"19010\")\n"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "gpuClass": "standard",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}