File size: 6,372 Bytes

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "This Notebook is a Stable-diffusion tool which allows you to find similiar tokens from the SD 1.5 vocab.json that you can use for text-to-image generation"
      ],
      "metadata": {
        "id": "L7JTcbOdBPfh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the tokens into the colab\n",
        "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
        "import torch\n",
        "from torch import linalg as LA\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "%cd /content/sd_tokens\n",
        "token = torch.load('sd15_tensors.pt', map_location=device, weights_only=True)\n",
        "#-----#\n",
        "\n",
        "#Import the vocab.json\n",
        "import json\n",
        "import pandas as pd\n",
        "with open('vocab.json', 'r') as f:\n",
        "    data = json.load(f)\n",
        "\n",
        "_df = pd.DataFrame({'count': data})['count']\n",
        "\n",
        "vocab = {\n",
        "    value: key for key, value in _df.items()\n",
        "}\n",
        "#-----#\n",
        "\n",
        "# Define functions/constants\n",
        "NUM_TOKENS = 49407\n",
        "\n",
        "def absolute_value(x):\n",
        "    return max(x, -x)\n",
        "\n",
        "def similarity(id_A , id_B):\n",
        "  #Tensors\n",
        "  A = token[id_A]\n",
        "  B = token[id_B]\n",
        "  #Tensor vector length (2nd order, i.e (a^2 + b^2 + ....)^(1/2)\n",
        "  _A = LA.vector_norm(A, ord=2)\n",
        "  _B = LA.vector_norm(B, ord=2)\n",
        "  #----#\n",
        "  result = torch.dot(A,B)/(_A*_B)\n",
        "  similarity_pcnt = absolute_value(result.item()*100)\n",
        "  similarity_pcnt_aprox = round(similarity_pcnt, 3)\n",
        "  result = f'{similarity_pcnt_aprox} %'\n",
        "  return result\n",
        "#----#"
      ],
      "metadata": {
        "id": "Ch9puvwKH1s3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(vocab[8922]) #the vocab item for ID 8922\n",
        "print(token[8922].shape)  #dimension of the token"
      ],
      "metadata": {
        "id": "S_Yh9gH_OUA1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Get the IDs from a prompt text.\n",
        "\n",
        "The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens"
      ],
      "metadata": {
        "id": "f1-jS7YJApiO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "from transformers import AutoTokenizer\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
        "prompt= \"banana\" # @param {type:'string'}\n",
        "tokenizer_output = tokenizer(text = prompt)\n",
        "input_ids = tokenizer_output['input_ids']\n",
        "print(input_ids)"
      ],
      "metadata": {
        "id": "RPdkYzT2_X85"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1\n",
        "\n",
        "id_A = input_ids[1]\n",
        "A = token[id_A]\n",
        "_A = LA.vector_norm(A, ord=2)\n",
        "dots = torch.zeros(NUM_TOKENS)\n",
        "\n",
        "for index in range(NUM_TOKENS):\n",
        "  id_B = index\n",
        "  B = token[id_B]\n",
        "  _B = LA.vector_norm(B, ord=2)\n",
        "  result = torch.dot(A,B)/(_A*_B)\n",
        "  result = absolute_value(result.item())\n",
        "  dots[index] = result\n",
        "\n",
        "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
        "#----#\n",
        "print(f'Calculated all cosine-similarities between the token {vocab[id_A]} with ID = {id_A} the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
        "print(f'Calculated indices as a 1x{indices.shape[0]} tensor')"
      ],
      "metadata": {
        "id": "juxsvco9B0iV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "list_size = 100 # @param {type:'number'}\n",
        "\n",
        "print_ID = False # @param {type:\"boolean\"}\n",
        "print_Similarity = True # @param {type:\"boolean\"}\n",
        "print_Name = True # @param {type:\"boolean\"}\n",
        "print_Divider = False # @param {type:\"boolean\"}\n",
        "\n",
        "for index in range(list_size):\n",
        "  id = indices[index].item()\n",
        "  if (print_Name):\n",
        "    print(f'{vocab[id]}') # vocab item\n",
        "  if (print_ID):\n",
        "    print(f'ID = {id}') # IDs\n",
        "  if (print_Similarity):\n",
        "    print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
        "  if (print_Divider):\n",
        "    print('--------')"
      ],
      "metadata": {
        "id": "YIEmLAzbHeuo"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Find the most similiar Tokens for given input"
      ],
      "metadata": {
        "id": "qqZ5DvfLBJnw"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
      ],
      "metadata": {
        "id": "kX72bAuhOtlT"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "id_for_token_A = 4567 # @param {type:'number'}\n",
        "id_for_token_B = 4343 # @param {type:'number'}\n",
        "\n",
        "similarity_str =  'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
        "\n",
        "print(similarity_str)"
      ],
      "metadata": {
        "id": "MwmOdC9cNZty"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}