{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Job-description classifier\n",
    "\n",
    "Embeds job descriptions with `sentence-transformers/all-MiniLM-L6-v2`, trains a small feed-forward classifier to predict *liked* vs *disliked*, and publishes the trained classifier to the Hugging Face Hub.\n",
    "\n",
    "The notebook is meant to be run top to bottom on a fresh kernel (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "from huggingface_hub import HfApi, HfFolder, PyTorchModelHubMixin\n",
    "from transformers import AutoModel, AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune.\n",
    "SEED = 42\n",
    "ENCODER_NAME = 'sentence-transformers/all-MiniLM-L6-v2'\n",
    "LOCAL_MODEL_DIR = 'job-classifier'\n",
    "REPO_ID = 'Robzy/job-classifier'\n",
    "\n",
    "# Seed every source of randomness used below (weight initialisation, shuffling).\n",
    "torch.manual_seed(SEED)\n",
    "random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the sentence encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load model directly\n",
    "tokenizer = AutoTokenizer.from_pretrained(ENCODER_NAME)\n",
    "# The encoder is only used for inference in this notebook, so keep it in eval mode.\n",
    "model = AutoModel.from_pretrained(ENCODER_NAME).eval()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and shuffle the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory containing the job files\n",
    "jobs_dir = 'jobs'\n",
    "\n",
    "# (text, label) pairs: label 0 when the directory path contains 'disliked', 1 otherwise\n",
    "dataset = []\n",
    "\n",
    "# Walk through the directory\n",
    "for root, dirs, files in os.walk(jobs_dir):\n",
    "    # os.walk order depends on the filesystem; sort so the seeded shuffle below is reproducible\n",
    "    dirs.sort()\n",
    "    label = 0 if 'disliked' in root else 1\n",
    "    for file in sorted(files):\n",
    "        if file.endswith('.txt'):\n",
    "            file_path = os.path.join(root, file)\n",
    "            # Explicit encoding: the platform default is not UTF-8 everywhere\n",
    "            with open(file_path, 'r', encoding='utf-8') as f:\n",
    "                dataset.append((f.read(), label))\n",
    "\n",
    "# Fail early with a clear message instead of a confusing tokenizer error further down\n",
    "assert dataset, f'No .txt files found under {jobs_dir!r}'\n",
    "\n",
    "# Print the number of txt files\n",
    "print(len(dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle texts and labels together. A dedicated seeded generator makes this cell\n",
    "# idempotent: re-running it always produces the same order.\n",
    "shuffled = random.Random(SEED).sample(dataset, k=len(dataset))\n",
    "txts = [txt for txt, _ in shuffled]\n",
    "labels = [label for _, label in shuffled]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class balance as (liked, disliked)\n",
    "sum(labels), len(labels) - sum(labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sentence embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mean_pooling(model_output, attention_mask):\n",
    "    \"\"\"Average the token embeddings of each sequence, ignoring padding positions.\"\"\"\n",
    "    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings\n",
    "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
    "    # clamp guards against division by zero for a sequence whose mask is all zeros\n",
    "    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n",
    "\n",
    "\n",
    "def embed_texts(texts):\n",
    "    \"\"\"Return one L2-normalised sentence embedding per input text.\n",
    "\n",
    "    `texts` is a single string or a list of strings; the result has one row per text.\n",
    "    Uses the module-level `tokenizer` and `model`.\n",
    "    \"\"\"\n",
    "    # truncation=True: texts longer than the tokenizer's maximum length are cut off\n",
    "    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')\n",
    "    # Compute token embeddings without tracking gradients (the encoder is not trained)\n",
    "    with torch.no_grad():\n",
    "        model_output = model(**encoded_input)\n",
    "    pooled = mean_pooling(model_output, encoded_input['attention_mask'])\n",
    "    return F.normalize(pooled, p=2, dim=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentence_embeddings = embed_texts(txts)\n",
    "sentence_embeddings.size()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the neural network\n",
    "class TwoLayerNN(nn.Module, PyTorchModelHubMixin):\n",
    "    \"\"\"Feed-forward binary classifier: Linear -> ReLU -> Linear -> Sigmoid.\"\"\"\n",
    "\n",
    "    def __init__(self, input_dim, hidden_dim, output_dim):\n",
    "        super().__init__()\n",
    "        self.fc1 = nn.Linear(input_dim, hidden_dim)\n",
    "        self.relu = nn.ReLU()\n",
    "        self.fc2 = nn.Linear(hidden_dim, output_dim)\n",
    "        self.sigmoid = nn.Sigmoid()\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Map a batch of embeddings to sigmoid outputs in (0, 1).\"\"\"\n",
    "        out = self.fc1(x)\n",
    "        out = self.relu(out)\n",
    "        out = self.fc2(out)\n",
    "        out = self.sigmoid(out)\n",
    "        return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameters\n",
    "input_dim = sentence_embeddings.size(1)\n",
    "hidden_dim = 128\n",
    "output_dim = 1\n",
    "num_epochs = 20\n",
    "learning_rate = 0.001\n",
    "\n",
    "# Model, loss function, and optimizer\n",
    "classifier = TwoLayerNN(input_dim, hidden_dim, output_dim)\n",
    "criterion = nn.BCELoss()\n",
    "optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)\n",
    "\n",
    "# BCELoss needs float targets shaped like the model output, hence the extra dimension\n",
    "labels_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)\n",
    "\n",
    "# Training loop (full batch: every step uses all embeddings)\n",
    "classifier.train()\n",
    "for epoch in range(num_epochs):\n",
    "    optimizer.zero_grad()\n",
    "    outputs = classifier(sentence_embeddings)\n",
    "    loss = criterion(outputs, labels_tensor)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "    if (epoch+1) % 5 == 0:\n",
    "        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')\n",
    "\n",
    "print(\"Training complete.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inference\n",
    "classifier.eval()\n",
    "job_description = \"A very fun job with data science and machine learning\"\n",
    "sentence_embedding = embed_texts(job_description)\n",
    "# No gradients are needed for a prediction\n",
    "with torch.no_grad():\n",
    "    score = classifier(sentence_embedding).item()\n",
    "prediction = 'liked' if score > 0.5 else 'disliked'\n",
    "print(f\"Job description: {job_description}\")\n",
    "print(f\"Prediction: {prediction} (score: {score:.4f})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save and publish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the classifier and tokenizer locally\n",
    "classifier.save_pretrained(LOCAL_MODEL_DIR)\n",
    "tokenizer.save_pretrained(LOCAL_MODEL_DIR)\n",
    "\n",
    "# Initialize the HfApi\n",
    "api = HfApi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Publish the trained classifier, not `model` (the unmodified sentence encoder),\n",
    "# so that TwoLayerNN.from_pretrained(REPO_ID, ...) below finds matching weights.\n",
    "classifier.push_to_hub(REPO_ID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the published classifier from the Hub\n",
    "input_dim = 384\n",
    "hidden_dim = 128\n",
    "output_dim = 1\n",
    "classifier = TwoLayerNN.from_pretrained(REPO_ID, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).eval()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}